From 7610047be9073cbce59c2f44b09bbc1bfdb46879 Mon Sep 17 00:00:00 2001 From: Marco van den Boom Date: Thu, 25 Jan 2024 10:44:34 +0100 Subject: [PATCH 1/2] drop add_traces --- atom/plots/baseplot.py | 11 +- atom/plots/dataplot.py | 478 +++++++++------ atom/plots/hyperparametertuningplot.py | 302 +++++---- atom/plots/predictionplot.py | 806 ++++++++++++------------- 4 files changed, 802 insertions(+), 795 deletions(-) diff --git a/atom/plots/baseplot.py b/atom/plots/baseplot.py index b19a0e58d..faa5fbb72 100644 --- a/atom/plots/baseplot.py +++ b/atom/plots/baseplot.py @@ -392,8 +392,8 @@ def _draw_line( child: str | None = None, legend: Legend | dict[str, Any] | None = None, **kwargs, - ) -> go.Scatter: - """Draw a line. + ): + """Draw a line on the current figure. Unify the style to draw a line, where parent and child (e.g., model - data set or column - distribution) keep the @@ -414,13 +414,8 @@ def _draw_line( **kwargs Additional keyword arguments for the trace. - Returns - ------- - go.Scatter - New trace to add to figure. 
- """ - return go.Scatter( + Baseplot._fig.figure.add_scatter( line=kwargs.pop( "line", { "width": self.line_width, diff --git a/atom/plots/dataplot.py b/atom/plots/dataplot.py index 2b2ce8328..de26e44b5 100644 --- a/atom/plots/dataplot.py +++ b/atom/plots/dataplot.py @@ -10,9 +10,10 @@ from abc import ABCMeta from pathlib import Path from typing import Any, Literal - +from statsmodels.tsa.stattools import pacf import numpy as np import pandas as pd +from sklearn.utils.metaestimators import available_if import plotly.graph_objects as go from beartype import beartype from nltk.collocations import ( @@ -29,7 +30,7 @@ Segment, Sequence, Series, ) from atom.utils.utils import ( - check_dependency, crash, divide, get_corpus, lst, replace_missing, rnd, + check_dependency, crash, divide, get_corpus, lst, replace_missing, rnd, has_task ) @@ -136,22 +137,20 @@ def plot_components( color = BasePlot._fig.get_elem("components") opacity = [0.2] * self.pca_._comps + [0] * (len(variance) - self.pca_._comps) - fig.add_trace( - go.Bar( - x=variance, - y=[f"pca{i}" for i in range(len(variance))], - orientation="h", - marker={ - "color": [f"rgba({color[4:-1]}, {o})" for o in opacity], - "line": {"width": 2, "color": color}, - }, - hovertemplate="%{x}", - name=f"Variance retained: {variance[:self.pca_._comps].sum():.3f}", - legendgroup="components", - showlegend=BasePlot._fig.showlegend("components", legend), - xaxis=xaxis, - yaxis=yaxis, - ) + fig.add_bar( + x=variance, + y=[f"pca{i}" for i in range(len(variance))], + orientation="h", + marker={ + "color": [f"rgba({color[4:-1]}, {o})" for o in opacity], + "line": {"width": 2, "color": color}, + }, + hovertemplate="%{x}", + name=f"Variance retained: {variance[:self.pca_._comps].sum():.3f}", + legendgroup="components", + showlegend=BasePlot._fig.showlegend("components", legend), + xaxis=xaxis, + yaxis=yaxis, ) fig.update_layout({f"yaxis{yaxis[1:]}": {"categoryorder": "total ascending"}}) @@ -264,18 +263,16 @@ def plot_correlation( }, ) 
- fig.add_trace( - go.Heatmap( - z=corr.mask(mask), - x=columns_c, - y=columns_c, - coloraxis=f"coloraxis{xaxis[1:]}", - hovertemplate="x:%{x}
y:%{y}
z:%{z}", - hoverongaps=False, - showlegend=False, - xaxis=xaxis, - yaxis=yaxis, - ) + fig.add_heatmap( + z=corr.mask(mask), + x=columns_c, + y=columns_c, + coloraxis=f"coloraxis{xaxis[1:]}", + hovertemplate="x:%{x}
y:%{y}
z:%{z}", + hoverongaps=False, + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, ) fig.update_layout( @@ -412,21 +409,19 @@ def plot_distribution( show_c = self._get_show(show, len(series)) color = BasePlot._fig.get_elem() - fig.add_trace( - go.Bar( - x=series, - y=series.index, - orientation="h", - marker={ - "color": f"rgba({color[4:-1]}, 0.2)", - "line": {"width": 2, "color": color}, - }, - hovertemplate="%{x}", - name=f"{columns_c[0]}: {len(series)} classes", - showlegend=BasePlot._fig.showlegend("dist", legend), - xaxis=xaxis, - yaxis=yaxis, - ) + fig.add_bar( + x=series, + y=series.index, + orientation="h", + marker={ + "color": f"rgba({color[4:-1]}, 0.2)", + "line": {"width": 2, "color": color}, + }, + hovertemplate="%{x}", + name=f"{columns_c[0]}: {len(series)} classes", + showlegend=BasePlot._fig.showlegend("dist", legend), + xaxis=xaxis, + yaxis=yaxis, ) return self._plot( @@ -443,22 +438,20 @@ def plot_distribution( else: for col in [c for c in columns_c if c in num_columns]: - fig.add_trace( - go.Histogram( - x=self.branch.dataset[col], - histnorm="probability density", - marker={ - "color": f"rgba({BasePlot._fig.get_elem(col)[4:-1]}, 0.2)", - "line": {"width": 2, "color": BasePlot._fig.get_elem(col)}, - }, - nbinsx=40, - name="dist", - legendgroup=col, - legendgrouptitle={"text": col, "font_size": self.label_fontsize}, - showlegend=BasePlot._fig.showlegend(f"{col}-dist", legend), - xaxis=xaxis, - yaxis=yaxis, - ) + fig.add_histogram( + x=self.branch.dataset[col], + histnorm="probability density", + marker={ + "color": f"rgba({BasePlot._fig.get_elem(col)[4:-1]}, 0.2)", + "line": {"width": 2, "color": BasePlot._fig.get_elem(col)}, + }, + nbinsx=40, + name="dist", + legendgroup=col, + legendgrouptitle={"text": col, "font_size": self.label_fontsize}, + showlegend=BasePlot._fig.showlegend(f"{col}-dist", legend), + xaxis=xaxis, + yaxis=yaxis, ) x = np.linspace( @@ -480,16 +473,14 @@ def plot_distribution( params = getattr(stats, dist).fit(values) y = 
getattr(stats, dist).pdf(x, *params) - fig.add_trace( - self._draw_line( - x=x, - y=y, - parent=col, - child=dist, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) + self._draw_line( + x=x, + y=y, + parent=col, + child=dist, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, ) fig.update_layout({"barmode": "overlay"}) @@ -650,22 +641,20 @@ def get_text(column: Series) -> Series: fig = self._get_figure() xaxis, yaxis = BasePlot._fig.get_axes() - fig.add_trace( - go.Bar( - x=(data := series[-self._get_show(show, len(series)):]), - y=data.index, - orientation="h", - marker={ - "color": f"rgba({BasePlot._fig.get_elem(ngram_c)[4:-1]}, 0.2)", - "line": {"width": 2, "color": BasePlot._fig.get_elem(ngram_c)}, - }, - hovertemplate="%{x}", - name=f"Total {ngram_c}: {len(series)}", - legendgroup=ngram_c, - showlegend=BasePlot._fig.showlegend(ngram_c, legend), - xaxis=xaxis, - yaxis=yaxis, - ) + fig.add_bar( + x=(data := series[-self._get_show(show, len(series)):]), + y=data.index, + orientation="h", + marker={ + "color": f"rgba({BasePlot._fig.get_elem(ngram_c)[4:-1]}, 0.2)", + "line": {"width": 2, "color": BasePlot._fig.get_elem(ngram_c)}, + }, + hovertemplate="%{x}", + name=f"Total {ngram_c}: {len(series)}", + legendgroup=ngram_c, + showlegend=BasePlot._fig.showlegend(ngram_c, legend), + xaxis=xaxis, + yaxis=yaxis, ) return self._plot( @@ -679,6 +668,131 @@ def get_text(column: Series) -> Series: display=display, ) + @available_if(has_task("forecast")) + @crash + def plot_pacf( + self, + columns: ColumnSelector | None = None, + show: IntLargerZero | None = 10, + *, + title: str | dict[str, Any] | None = None, + legend: Legend | dict[str, Any] | None = "lower right", + figsize: tuple[IntLargerZero, IntLargerZero] | None = None, + filename: str | Path | None = None, + display: Bool | None = True, + ) -> go.Figure | None: + """Plot the partial autocorrelation function. + + Missing values are ignored. + + !!! 
tip + Use atom's [decompose][atomforecaster-decompose] method to + remove trend and seasonality from the data. + + Parameters + ---------- + columns: int, str, segment, sequence, dataframe or None, default=None + Columns to plot the pacf from. If None, it selects the + target column. + + show: int or None, default=10 + Number of lags to show in the plot. If None, show all + lags (up to 200). + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="lower right" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple or None, default=None + Figure's size in pixels, format as (x, y). If None, it + adapts the size to the number of lags shown. + + filename: str, Path or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. 
+ + See Also + -------- + atom.plots:DataPlot.plot_acf + atom.plots:DataPlot.plot_decomposition + atom.plots:DataPlot.plot_ttf + + Examples + -------- + ```pycon + from atom import ATOMForecaster + from sktime.datasets import load_airline + + y = load_airline() + + atom = ATOMForecaster(y, random_state=1) + atom.plot_pacf() + ``` + + """ + if columns is None: + columns_c = lst(self.branch.target) + else: + columns_c = self.branch._get_columns(columns) + show_c = self._get_show(show) + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + + for col in columns_c: + corr_array = pacf(self.branch.dataset[col].dropna(), nlags=10, alpha=0.05) + + lower_y = corr_array[1][:, 0] - corr_array[0] + upper_y = corr_array[1][:, 1] - corr_array[0] + + for x in range(len(corr_array[0])): + fig.add_scatter(x=(x, x), y=(0, corr_array[0][x]), mode='lines', line_color='#3f3f3f', xaxis=xaxis, yaxis=yaxis) + + fig.add_scatter(x=np.arange(len(corr_array[0])), y=corr_array[0], mode='markers', + marker_color='#1f77b4', + marker_size=12, xaxis=xaxis, yaxis=yaxis) + fig.add_scatter(x=np.arange(len(corr_array[0])), y=upper_y, mode='lines', + line_color='rgba(255,255,255,0)', xaxis=xaxis, yaxis=yaxis) + fig.add_scatter(x=np.arange(len(corr_array[0])), y=lower_y, mode='lines', + fillcolor='rgba(32, 146, 230,0.3)', + fill='tonexty', line_color='rgba(255,255,255,0)', xaxis=xaxis, yaxis=yaxis) + + fig.update_traces(showlegend=False) + # fig.update_xaxes(range=[-1, 42]) + fig.update_yaxes(zerolinecolor="black") + + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlabel="Lag", + title=title, + legend=legend, + figsize=figsize or (900, 400 + show_c * 50), + plotname="plot_pacf", + filename=filename, + display=display, + ) + @crash def plot_pca( self, @@ -761,23 +875,21 @@ def plot_pca( fig = self._get_figure() xaxis, yaxis = BasePlot._fig.get_axes() - fig.add_trace( - go.Scatter( - x=tuple(range(1, self.pca_.n_features_in_ + 1)), - 
y=np.cumsum(self.pca_.explained_variance_ratio_), - mode="lines+markers", - line={"width": self.line_width, "color": BasePlot._fig.get_elem("pca")}, - marker={ - "symbol": symbols, - "size": sizes, - "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"}, - "opacity": 1, - }, - hovertemplate="%{y}", - showlegend=False, - xaxis=xaxis, - yaxis=yaxis, - ) + fig.add_scatter( + x=tuple(range(1, self.pca_.n_features_in_ + 1)), + y=np.cumsum(self.pca_.explained_variance_ratio_), + mode="lines+markers", + line={"width": self.line_width, "color": BasePlot._fig.get_elem("pca")}, + marker={ + "symbol": symbols, + "size": sizes, + "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"}, + "opacity": 1, + }, + hovertemplate="%{y}", + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, ) fig.update_layout( @@ -821,7 +933,7 @@ def plot_qq( Parameters ---------- - columns: int, str, slice or sequence, default=0 + columns: int, str, segment, sequence or dataframe, default=0 Columns to plot. Selected categorical columns are ignored. 
distributions: str or sequence, default="norm" @@ -896,17 +1008,15 @@ def plot_qq( params = stat.fit(values) samples = stat.rvs(*params, size=101, random_state=self.random_state) - fig.add_trace( - self._draw_line( - x=(x := np.percentile(samples, percentiles)), - y=(y := np.percentile(values, percentiles)), - mode="markers", - parent=col, - child=dist, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) + self._draw_line( + x=(x := np.percentile(samples, percentiles)), + y=(y := np.percentile(values, percentiles)), + mode="markers", + parent=col, + child=dist, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, ) self._draw_straight_line((x, y), y="diagonal", xaxis=xaxis, yaxis=yaxis) @@ -1027,43 +1137,37 @@ def plot_relationships( ) if x == y: - fig.add_trace( - go.Histogram( - x=self.branch.dataset[columns_c[x]], - marker={ - "color": f"rgba({color[4:-1]}, 0.2)", - "line": {"width": 2, "color": color}, - }, - name=columns_c[x], - showlegend=False, - xaxis=xaxis, - yaxis=yaxis, - ) + fig.add_histogram( + x=self.branch.dataset[columns_c[x]], + marker={ + "color": f"rgba({color[4:-1]}, 0.2)", + "line": {"width": 2, "color": color}, + }, + name=columns_c[x], + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, ) elif x > y: - fig.add_trace( - go.Scatter( - x=sample(columns_c[y]), - y=sample(columns_c[x]), - mode="markers", - marker={"color": color}, - hovertemplate="(%{x}, %{y})", - showlegend=False, - xaxis=xaxis, - yaxis=yaxis, - ) + fig.add_scatter( + x=sample(columns_c[y]), + y=sample(columns_c[x]), + mode="markers", + marker={"color": color}, + hovertemplate="(%{x}, %{y})", + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, ) elif y > x: - fig.add_trace( - go.Histogram2dContour( - x=self.branch.dataset[columns_c[y]], - y=self.branch.dataset[columns_c[x]], - coloraxis=f"coloraxis{xaxis[1:]}", - hovertemplate="x:%{x}
y:%{y}
z:%{z}", - showlegend=False, - xaxis=xaxis, - yaxis=yaxis, - ) + fig.add_histogram2dcontour( + x=self.branch.dataset[columns_c[y]], + y=self.branch.dataset[columns_c[x]], + coloraxis=f"coloraxis{xaxis[1:]}", + hovertemplate="x:%{x}
y:%{y}
z:%{z}", + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, ) if x < len(columns_c) - 1: @@ -1181,24 +1285,22 @@ def plot_rfecv( mean = self.rfecv_.cv_results_["mean_test_score"] std = self.rfecv_.cv_results_["std_test_score"] - fig.add_trace( - go.Scatter( - x=list(x), - y=mean, - mode="lines+markers", - line={"width": self.line_width, "color": BasePlot._fig.get_elem("rfecv")}, - marker={ - "symbol": symbols, - "size": sizes, - "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"}, - "opacity": 1, - }, - name=ylabel, - legendgroup="rfecv", - showlegend=BasePlot._fig.showlegend("rfecv", legend), - xaxis=xaxis, - yaxis=yaxis, - ) + fig.add_scatter( + x=list(x), + y=mean, + mode="lines+markers", + line={"width": self.line_width, "color": BasePlot._fig.get_elem("rfecv")}, + marker={ + "symbol": symbols, + "size": sizes, + "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"}, + "opacity": 1, + }, + name=ylabel, + legendgroup="rfecv", + showlegend=BasePlot._fig.showlegend("rfecv", legend), + xaxis=xaxis, + yaxis=yaxis, ) # Add error bands @@ -1340,22 +1442,20 @@ def plot_series( for col in columns_c: for child, ds in self._get_set(rows): - fig.add_trace( - self._draw_line( - x=self._get_plot_index(y := self.branch._get_rows(ds)[col]), - y=y, - mode="lines+markers", - marker={ - "size": self.marker_size, - "color": BasePlot._fig.get_elem(col), - "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"}, - }, - parent=col, - child=child, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) + self._draw_line( + x=self._get_plot_index(y := self.branch._get_rows(ds)[col]), + y=y, + mode="lines+markers", + marker={ + "size": self.marker_size, + "color": BasePlot._fig.get_elem(col), + "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"}, + }, + parent=col, + child=child, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, ) return self._plot( @@ -1476,13 +1576,11 @@ def get_text(column): fig = self._get_figure() xaxis, yaxis = BasePlot._fig.get_axes() - fig.add_trace( - 
go.Image( - z=wordcloud.generate(get_text(rows_c[corpus])), - hoverinfo="skip", - xaxis=xaxis, - yaxis=yaxis, - ) + fig.add_image( + z=wordcloud.generate(get_text(rows_c[corpus])), + hoverinfo="skip", + xaxis=xaxis, + yaxis=yaxis, ) fig.update_layout( diff --git a/atom/plots/hyperparametertuningplot.py b/atom/plots/hyperparametertuningplot.py index dd799aab2..e520a6e03 100644 --- a/atom/plots/hyperparametertuningplot.py +++ b/atom/plots/hyperparametertuningplot.py @@ -254,16 +254,14 @@ def plot_edf( for m in models_c: for met in metric_c: y = np.sum(m.trials[met].values[:, np.newaxis] <= x, axis=0) - fig.add_trace( - self._draw_line( - x=x, - y=y / len(m.trials), - parent=m.name, - child=met, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) + self._draw_line( + x=x, + y=y / len(m.trials), + parent=m.name, + child=met, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, ) BasePlot._fig.used_models.extend(models_c) @@ -379,22 +377,20 @@ def plot_hyperparameter_importance( fanova = FanovaImportanceEvaluator(seed=self.random_state) importances = fanova.evaluate(m.study, target=self._optuna_target(metric_c)) - fig.add_trace( - go.Bar( - x=np.array(list(importances.values())) / sum(importances.values()), - y=list(importances), - orientation="h", - marker={ - "color": f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)", - "line": {"width": 2, "color": BasePlot._fig.get_elem(m.name)}, - }, - hovertemplate="%{x}", - name=m.name, - legendgroup=m.name, - showlegend=BasePlot._fig.showlegend(m.name, legend), - xaxis=xaxis, - yaxis=yaxis, - ) + fig.add_bar( + x=np.array(list(importances.values())) / sum(importances.values()), + y=list(importances), + orientation="h", + marker={ + "color": f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)", + "line": {"width": 2, "color": BasePlot._fig.get_elem(m.name)}, + }, + hovertemplate="%{x}", + name=m.name, + legendgroup=m.name, + showlegend=BasePlot._fig.showlegend(m.name, legend), + xaxis=xaxis, + yaxis=yaxis, ) fig.update_layout( @@ 
-534,46 +530,42 @@ def plot_hyperparameters( }, ) - fig.add_trace( - go.Scatter( - x=model.trials[params_c[y]], - y=model.trials[params_c[x + 1]], - mode="markers", - marker={ - "size": self.marker_size, - "color": BasePlot._fig.get_elem(model.name), - "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"}, - }, - customdata=list( - zip(model.trials.index, model.trials[metric_c], strict=True) - ), - hovertemplate=( - f"{params_c[y]}:%{{x}}
" - f"{params_c[x + 1]}:%{{y}}
" - f"{metric_c}:%{{customdata[1]:.4f}}" - "Trial %{customdata[0]}" - ), - showlegend=False, - xaxis=xaxis, - yaxis=yaxis, - ) + fig.add_scatter( + x=model.trials[params_c[y]], + y=model.trials[params_c[x + 1]], + mode="markers", + marker={ + "size": self.marker_size, + "color": BasePlot._fig.get_elem(model.name), + "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"}, + }, + customdata=list( + zip(model.trials.index, model.trials[metric_c], strict=True) + ), + hovertemplate=( + f"{params_c[y]}:%{{x}}
" + f"{params_c[x + 1]}:%{{y}}
" + f"{metric_c}:%{{customdata[1]:.4f}}" + "Trial %{customdata[0]}" + ), + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, ) - fig.add_trace( - go.Contour( - x=model.trials[params_c[y]], - y=model.trials[params_c[x + 1]], - z=model.trials[metric_c], - contours={ - "showlabels": True, - "labelfont": {"size": self.tick_fontsize, "color": "white"}, - }, - coloraxis="coloraxis99", - hoverinfo="skip", - showlegend=False, - xaxis=xaxis, - yaxis=yaxis, - ) + fig.add_contour( + x=model.trials[params_c[y]], + y=model.trials[params_c[x + 1]], + z=model.trials[metric_c], + contours={ + "showlabels": True, + "labelfont": {"size": self.tick_fontsize, "color": "white"}, + }, + coloraxis="coloraxis99", + hoverinfo="skip", + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, ) if _is_log_scale(model.study.trials, params_c[y]): @@ -762,17 +754,15 @@ def sort_mixed_types(values: list[str]) -> list[str]: } ) - fig.add_trace( - go.Parcoords( - dimensions=dims, - line={ - "color": dims[0]["values"], - "coloraxis": f"coloraxis{xaxis[1:]}", - }, - unselected={"line": {"color": "gray", "opacity": 0.5}}, - labelside="bottom", - labelfont={"size": self.label_fontsize}, - ) + fig.add_parcoords( + dimensions=dims, + line={ + "color": dims[0]["values"], + "coloraxis": f"coloraxis{xaxis[1:]}", + }, + unselected={"line": {"color": "gray", "opacity": 0.5}}, + labelside="bottom", + labelfont={"size": self.label_fontsize}, ) BasePlot._fig.used_models.append(model) @@ -904,22 +894,20 @@ def plot_pareto_front( y=(y_pos, rnd(y_pos + size)), ) - fig.add_trace( - go.Scatter( - x=model.trials[metric_c[y]], - y=model.trials[metric_c[x + 1]], - mode="markers", - marker={ - "size": self.marker_size, - "color": model.trials.index, - "colorscale": "Teal", - "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"}, - }, - customdata=model.trials.index, - hovertemplate="(%{x}, %{y})Trial %{customdata}", - xaxis=xaxis, - yaxis=yaxis, - ) + fig.add_scatter( + x=model.trials[metric_c[y]], + y=model.trials[metric_c[x 
+ 1]], + mode="markers", + marker={ + "size": self.marker_size, + "color": model.trials.index, + "colorscale": "Teal", + "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"}, + }, + customdata=model.trials.index, + hovertemplate="(%{x}, %{y})Trial %{customdata}", + xaxis=xaxis, + yaxis=yaxis, ) if x < length - 1: @@ -1057,22 +1045,20 @@ def plot_slice( y=(y_pos, rnd(y_pos + y_size)), ) - fig.add_trace( - go.Scatter( - x=model.trials[params_c[y]], - y=model.trials[metric_c[x]], - mode="markers", - marker={ - "size": self.marker_size, - "color": model.trials.index, - "colorscale": "Teal", - "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"}, - }, - customdata=model.trials.index, - hovertemplate="(%{x}, %{y})Trial %{customdata}", - xaxis=xaxis, - yaxis=yaxis, - ) + fig.add_scatter( + x=model.trials[params_c[y]], + y=model.trials[metric_c[x]], + mode="markers", + marker={ + "size": self.marker_size, + "color": model.trials.index, + "colorscale": "Teal", + "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"}, + }, + customdata=model.trials.index, + hovertemplate="(%{x}, %{y})Trial %{customdata}", + xaxis=xaxis, + yaxis=yaxis, ) if _is_log_scale(model.study.trials, params_c[y]): @@ -1205,17 +1191,15 @@ def plot_terminator_improvement( "(e.g., using ht_params={'cv': 5}) on a single-metric optimization." 
) - fig.add_trace( - self._draw_line( - x=m.trials.index, - y=info.improvements, - error_y={"type": "data", "array": info.errors}, - mode="markers+lines", - parent=m.name, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) + self._draw_line( + x=m.trials.index, + y=info.improvements, + error_y={"type": "data", "array": info.errors}, + mode="markers+lines", + parent=m.name, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, ) BasePlot._fig.used_models.extend(models_c) @@ -1350,24 +1334,22 @@ def plot_timeline( for state in sorted(TrialState, key=lambda x: x.name): if bars := list(filter(lambda x: x.state == state, info)): - fig.add_trace( - go.Bar( - name=state.name, - x=[b.duration for b in bars], - y=[b.number for b in bars], - base=[b.start.isoformat() for b in bars], - text=[b.hovertext for b in bars], - textposition="none", - hovertemplate=f"%{{text}}{m.name}", - orientation="h", - marker={ - "color": f"rgba({_cm[state.name][4:-1]}, 0.2)", - "line": {"width": 2, "color": _cm[state.name]}, - }, - showlegend=BasePlot._fig.showlegend(_cm[state.name], legend), - xaxis=xaxis, - yaxis=yaxis, - ) + fig.add_bar( + name=state.name, + x=[b.duration for b in bars], + y=[b.number for b in bars], + base=[b.start.isoformat() for b in bars], + text=[b.hovertext for b in bars], + textposition="none", + hovertemplate=f"%{{text}}{m.name}", + orientation="h", + marker={ + "color": f"rgba({_cm[state.name][4:-1]}, 0.2)", + "line": {"width": 2, "color": _cm[state.name]}, + }, + showlegend=BasePlot._fig.showlegend(_cm[state.name], legend), + xaxis=xaxis, + yaxis=yaxis, ) fig.update_layout({f"xaxis{yaxis[1:]}_type": "date", "barmode": "group"}) @@ -1485,34 +1467,30 @@ def plot_trials( sizes = [self.marker_size] * len(m.trials) sizes[m.best_trial.number] = self.marker_size * 1.5 - fig.add_trace( - self._draw_line( - x=m.trials.index, - y=m.trials[met], - mode="lines+markers", - marker_symbol=symbols, - marker_size=sizes, - hovertemplate=None, - parent=m.name, - 
child=self._metric[met].name, - legend=legend, - xaxis=xaxis2, - yaxis=yaxis, - ) + self._draw_line( + x=m.trials.index, + y=m.trials[met], + mode="lines+markers", + marker_symbol=symbols, + marker_size=sizes, + hovertemplate=None, + parent=m.name, + child=self._metric[met].name, + legend=legend, + xaxis=xaxis2, + yaxis=yaxis, ) - fig.add_trace( - self._draw_line( - x=m.trials.index, - y=m.trials[met].diff(), - mode="lines+markers", - marker_symbol="circle", - parent=m.name, - child=self._metric[met].name, - legend=legend, - xaxis=xaxis2, - yaxis=yaxis2, - ) + self._draw_line( + x=m.trials.index, + y=m.trials[met].diff(), + mode="lines+markers", + marker_symbol="circle", + parent=m.name, + child=self._metric[met].name, + legend=legend, + xaxis=xaxis2, + yaxis=yaxis2, ) fig.update_layout( diff --git a/atom/plots/predictionplot.py b/atom/plots/predictionplot.py index f32954740..69f5fcd25 100644 --- a/atom/plots/predictionplot.py +++ b/atom/plots/predictionplot.py @@ -165,44 +165,40 @@ def plot_bootstrap( for met in metric_c: if any(m._bootstrap is None for m in models_c): - fig.add_trace( - go.Bar( - x=[m._best_score(met) for m in models_c], - y=[m.name for m in models_c], - error_x={ - "type": "data", - "array": [ - 0 if m._bootstrap is None else m.bootstrap.loc[:, met].std() - for m in models_c - ], - }, - orientation="h", - marker={ - "color": f"rgba({BasePlot._fig.get_elem(met)[4:-1]}, 0.2)", - "line": {"width": 2, "color": BasePlot._fig.get_elem(met)}, - }, - hovertemplate="%{x}", - name=met, - legendgroup=met, - showlegend=BasePlot._fig.showlegend(met, legend), - xaxis=xaxis, - yaxis=yaxis, - ) + fig.add_bar( + x=[m._best_score(met) for m in models_c], + y=[m.name for m in models_c], + error_x={ + "type": "data", + "array": [ + 0 if m._bootstrap is None else m.bootstrap.loc[:, met].std() + for m in models_c + ], + }, + orientation="h", + marker={ + "color": f"rgba({BasePlot._fig.get_elem(met)[4:-1]}, 0.2)", + "line": {"width": 2, "color": 
BasePlot._fig.get_elem(met)}, + }, + hovertemplate="%{x}", + name=met, + legendgroup=met, + showlegend=BasePlot._fig.showlegend(met, legend), + xaxis=xaxis, + yaxis=yaxis, ) else: - fig.add_trace( - go.Box( - x=np.ravel([m.bootstrap.loc[:, met] for m in models_c]), - y=np.ravel([[m.name] * len(m.bootstrap) for m in models_c]), - marker_color=BasePlot._fig.get_elem(met), - boxpoints="outliers", - orientation="h", - name=met, - legendgroup=met, - showlegend=BasePlot._fig.showlegend(met, legend), - xaxis=xaxis, - yaxis=yaxis, - ) + fig.add_box( + x=np.ravel([m.bootstrap.loc[:, met] for m in models_c]), + y=np.ravel([[m.name] * len(m.bootstrap) for m in models_c]), + marker_color=BasePlot._fig.get_elem(met), + boxpoints="outliers", + orientation="h", + name=met, + legendgroup=met, + showlegend=BasePlot._fig.showlegend(met, legend), + xaxis=xaxis, + yaxis=yaxis, ) fig.update_layout( @@ -347,34 +343,30 @@ def plot_calibration( # Get calibration (frac of positives and predicted values) frac_pos, pred = calibration_curve(y_true, y_pred, n_bins=n_bins) - fig.add_trace( - self._draw_line( - x=pred, - y=frac_pos, - parent=m.name, - child=child, - mode="lines+markers", - marker_symbol="circle", - legend=legend, - xaxis=xaxis2, - yaxis=yaxis, - ) + self._draw_line( + x=pred, + y=frac_pos, + parent=m.name, + child=child, + mode="lines+markers", + marker_symbol="circle", + legend=legend, + xaxis=xaxis2, + yaxis=yaxis, ) - fig.add_trace( - go.Histogram( - x=y_pred, - xbins={"start": 0, "end": 1, "size": 1.0 / n_bins}, - marker={ - "color": f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)", - "line": {"width": 2, "color": BasePlot._fig.get_elem(m.name)}, - }, - name=m.name, - legendgroup=m.name, - showlegend=False, - xaxis=xaxis2, - yaxis=yaxis2, - ) + fig.add_histogram( + x=y_pred, + xbins={"start": 0, "end": 1, "size": 1.0 / n_bins}, + marker={ + "color": f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)", + "line": {"width": 2, "color": BasePlot._fig.get_elem(m.name)}, + }, 
+ name=m.name, + legendgroup=m.name, + showlegend=False, + xaxis=xaxis2, + yaxis=yaxis2, ) self._draw_straight_line((pred, frac_pos), y="diagonal", xaxis=xaxis2, yaxis=yaxis) @@ -544,27 +536,25 @@ def plot_confusion_matrix( target_c, np.unique(m.branch.dataset[target_c]).astype(str) ) - fig.add_trace( - go.Heatmap( - x=ticks, - y=ticks, - z=100.0 * cm / cm.sum(axis=1)[:, np.newaxis], - coloraxis=f"coloraxis{xaxis[1:]}", - text=cm, - customdata=labels, - texttemplate="%{text}
(%{z:.2f}%)", - textfont={"size": self.label_fontsize}, - hovertemplate=( - "%{customdata}" - if self.task.is_binary - else "" - "Predicted label:%{x}
True label:%{y}
Percentage:%{z}" - "" - ), - showlegend=False, - xaxis=xaxis, - yaxis=yaxis, - ) + fig.add_heatmap( + x=ticks, + y=ticks, + z=100.0 * cm / cm.sum(axis=1)[:, np.newaxis], + coloraxis=f"coloraxis{xaxis[1:]}", + text=cm, + customdata=labels, + texttemplate="%{text}
(%{z:.2f}%)", + textfont={"size": self.label_fontsize}, + hovertemplate=( + "%{customdata}" + if self.task.is_binary + else "" + "Predicted label:%{x}
True label:%{y}
Percentage:%{z}" + "" + ), + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, ) fig.update_layout( @@ -577,22 +567,20 @@ def plot_confusion_matrix( ) else: - fig.add_trace( - go.Bar( - x=cm.ravel(), - y=labels.ravel(), - orientation="h", - marker={ - "color": f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)", - "line": {"width": 2, "color": BasePlot._fig.get_elem(m.name)}, - }, - hovertemplate="%{x}", - name=m.name, - legendgroup=m.name, - showlegend=BasePlot._fig.showlegend(m.name, legend), - xaxis=xaxis, - yaxis=yaxis, - ) + fig.add_bar( + x=cm.ravel(), + y=labels.ravel(), + orientation="h", + marker={ + "color": f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)", + "line": {"width": 2, "color": BasePlot._fig.get_elem(m.name)}, + }, + hovertemplate="%{x}", + name=m.name, + legendgroup=m.name, + showlegend=BasePlot._fig.showlegend(m.name, legend), + xaxis=xaxis, + yaxis=yaxis, ) fig.update_layout(bargroupgap=0.05) @@ -709,17 +697,15 @@ def plot_det( *m._get_pred(ds, target, method=("decision_function", "predict_proba")) ) - fig.add_trace( - self._draw_line( - x=fpr, - y=fnr, - mode="lines", - parent=m.name, - child=child, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) + self._draw_line( + x=fpr, + y=fnr, + mode="lines", + parent=m.name, + child=child, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, ) BasePlot._fig.used_models.extend(models_c) @@ -833,17 +819,15 @@ def plot_errors( for child, ds in self._get_set(rows): y_true, y_pred = m._get_pred(ds, target) - fig.add_trace( - self._draw_line( - x=y_true, - y=y_pred, - mode="markers", - parent=m.name, - child=child, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) + self._draw_line( + x=y_true, + y=y_pred, + mode="markers", + parent=m.name, + child=child, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, ) # Fit the points using linear regression @@ -852,17 +836,15 @@ def plot_errors( model = OrdinaryLeastSquares(goal=self._goal) estimator = model._get_est({}).fit(bk.DataFrame(y_true), y_pred) - 
fig.add_trace( - self._draw_line( - x=(x := np.linspace(y_true.min(), y_true.max(), 100)), - y=estimator.predict(x[:, np.newaxis]), - mode="lines", - hovertemplate="(%{x}, %{y})", - parent=m.name, - legend=None, - xaxis=xaxis, - yaxis=yaxis, - ) + self._draw_line( + x=(x := np.linspace(y_true.min(), y_true.max(), 100)), + y=estimator.predict(x[:, np.newaxis]), + mode="lines", + hovertemplate="(%{x}, %{y})", + parent=m.name, + legend=None, + xaxis=xaxis, + yaxis=yaxis, ) self._draw_straight_line((y_true, y_pred), y="diagonal", xaxis=xaxis, yaxis=yaxis) @@ -972,17 +954,15 @@ def plot_evals( ) for ds in dataset.split("+"): - fig.add_trace( - self._draw_line( - x=list(range(len(m.evals[f"{self._metric[0].name}_{ds}"]))), - y=m.evals[f"{self._metric[0].name}_{ds}"], - marker_symbol="circle", - parent=m.name, - child=ds, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) + self._draw_line( + x=list(range(len(m.evals[f"{self._metric[0].name}_{ds}"]))), + y=m.evals[f"{self._metric[0].name}_{ds}"], + marker_symbol="circle", + parent=m.name, + child=ds, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, ) BasePlot._fig.used_models.extend(models_c) @@ -1094,22 +1074,20 @@ def plot_feature_importance( "nor coef_ attribute." 
) from None - fig.add_trace( - go.Bar( - x=fi, - y=fi.index, - orientation="h", - marker={ - "color": f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)", - "line": {"width": 2, "color": BasePlot._fig.get_elem(m.name)}, - }, - hovertemplate="%{x}", - name=m.name, - legendgroup=m.name, - showlegend=BasePlot._fig.showlegend(m.name, legend), - xaxis=xaxis, - yaxis=yaxis, - ) + fig.add_bar( + x=fi, + y=fi.index, + orientation="h", + marker={ + "color": f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)", + "line": {"width": 2, "color": BasePlot._fig.get_elem(m.name)}, + }, + hovertemplate="%{x}", + name=m.name, + legendgroup=m.name, + showlegend=BasePlot._fig.showlegend(m.name, legend), + xaxis=xaxis, + yaxis=yaxis, ) fig.update_layout( @@ -1272,30 +1250,26 @@ def plot_forecast( y_true = m.branch._all.loc[y_pred.index, target_c] - fig.add_trace( - self._draw_line( - x=(x := self._get_plot_index(y_pred)), - y=y_pred, - mode="lines+markers", - parent=m.name, - legend=legend, - xaxis=xaxis2, - yaxis=yaxis, - ) + self._draw_line( + x=(x := self._get_plot_index(y_pred)), + y=y_pred, + mode="lines+markers", + parent=m.name, + legend=legend, + xaxis=xaxis2, + yaxis=yaxis, ) # Draw residuals - fig.add_trace( - self._draw_line( - x=x, - y=np.subtract(y_true, y_pred), - mode="lines+markers", - parent=m.name, - legend=legend, - showlegend=False, - xaxis=xaxis2, - yaxis=yaxis2, - ) + self._draw_line( + x=x, + y=np.subtract(y_true, y_pred), + mode="lines+markers", + parent=m.name, + legend=legend, + showlegend=False, + xaxis=xaxis2, + yaxis=yaxis2, ) if plot_interval: @@ -1343,17 +1317,15 @@ def plot_forecast( ) # Draw original time series - fig.add_trace( - go.Scatter( - x=x, - y=y_true, - mode="lines+markers", - line={"width": 1, "color": "black", "dash": "dash"}, - opacity=0.6, - showlegend=False, - xaxis=xaxis2, - yaxis=yaxis, - ) + fig.add_scatter( + x=x, + y=y_true, + mode="lines+markers", + line={"width": 1, "color": "black", "dash": "dash"}, + opacity=0.6, + 
showlegend=False, + xaxis=xaxis2, + yaxis=yaxis, ) # Draw horizontal reference line for residuals @@ -1478,17 +1450,15 @@ def plot_gains( ds, target, method=("decision_function", "predict_proba") ) - fig.add_trace( - self._draw_line( - x=(x := np.arange(start=1, stop=len(y_true) + 1) / len(y_true)), - y=(y := np.cumsum(y_true.iloc[np.argsort(y_pred)[::-1]]) / y_true.sum()), - mode="lines", - parent=m.name, - child=child, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) + self._draw_line( + x=(x := np.arange(start=1, stop=len(y_true) + 1) / len(y_true)), + y=(y := np.cumsum(y_true.iloc[np.argsort(y_pred)[::-1]]) / y_true.sum()), + mode="lines", + parent=m.name, + child=child, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, ) self._draw_straight_line((x, y), y="diagonal", xaxis=xaxis, yaxis=yaxis) @@ -1601,19 +1571,17 @@ def plot_learning_curve( std[m._group].append(m.bootstrap.loc[:, met].std()) for group in x: - fig.add_trace( - self._draw_line( - x=x[group], - y=y[group], - mode="lines+markers", - marker_symbol="circle", - error_y={"type": "data", "array": std[group], "visible": True}, - parent=group, - child=self._metric[met].name, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) + self._draw_line( + x=x[group], + y=y[group], + mode="lines+markers", + marker_symbol="circle", + error_y={"type": "data", "array": std[group], "visible": True}, + parent=group, + child=self._metric[met].name, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, ) # Add error bands @@ -1759,18 +1727,15 @@ def plot_lift( ds, target, method=("decision_function", "predict_proba") ) - gains = np.cumsum(y_true.iloc[np.argsort(y_pred)[::-1]]) / y_true.sum() - fig.add_trace( - self._draw_line( - x=(x := np.arange(start=1, stop=len(y_true) + 1) / len(y_true)), - y=(y := gains / x), - mode="lines", - parent=m.name, - child=child, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) + self._draw_line( + x=(x := np.arange(start=1, stop=len(y_true) + 1) / len(y_true)), + y=(y := 
np.cumsum(y_true.iloc[np.argsort(y_pred)[::-1]]) / y_true.sum() / x), + mode="lines", + parent=m.name, + child=child, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, ) self._draw_straight_line((x, y), y=1, xaxis=xaxis, yaxis=yaxis) @@ -1943,31 +1908,29 @@ class is always the positive one. else: color = BasePlot._fig.get_elem("parshap") - fig.add_trace( - go.Scatter( - x=(x := parshap["train"]), - y=(y := parshap["test"]), - mode="markers+text", - marker={ - "color": color, - "size": self.marker_size, - "coloraxis": f"coloraxis{xaxis[1:]}", - "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"}, - }, - text=m.branch.features, - textposition="top center", - customdata=(data := None if isinstance(color, str) else list(color)), - hovertemplate=( - f"%{{text}}
(%{{x}}, %{{y}})" - f"{'
Feature importance: %{customdata:.4f}' if data else ''}" - f"{m.name}" - ), - name=m.name, - legendgroup=m.name, - showlegend=BasePlot._fig.showlegend(m.name, legend), - xaxis=xaxis, - yaxis=yaxis, - ) + fig.add_scatter( + x=(x := parshap["train"]), + y=(y := parshap["test"]), + mode="markers+text", + marker={ + "color": color, + "size": self.marker_size, + "coloraxis": f"coloraxis{xaxis[1:]}", + "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"}, + }, + text=m.branch.features, + textposition="top center", + customdata=(data := None if isinstance(color, str) else list(color)), + hovertemplate=( + f"%{{text}}
(%{{x}}, %{{y}})" + f"{'
Feature importance: %{customdata:.4f}' if data else ''}" + f"{m.name}" + ), + name=m.name, + legendgroup=m.name, + showlegend=BasePlot._fig.showlegend(m.name, legend), + xaxis=xaxis, + yaxis=yaxis, ) self._draw_straight_line((x, y), y="diagonal", xaxis=xaxis, yaxis=yaxis) @@ -2198,18 +2161,16 @@ def plot_partial_dependence( # Draw the mean of the individual lines if "average" in kind: - fig.add_trace( - go.Scatter( - x=pred["values"][0], - y=pred["average"][target_c].ravel(), - mode="lines", - line={"width": 2, "color": color}, - name=m.name, - legendgroup=m.name, - showlegend=BasePlot._fig.showlegend(m.name, legend), - xaxis=ax[0], - yaxis=axes[0][1], - ) + fig.add_scatter( + x=pred["values"][0], + y=pred["average"][target_c].ravel(), + mode="lines", + line={"width": 2, "color": color}, + name=m.name, + legendgroup=m.name, + showlegend=BasePlot._fig.showlegend(m.name, legend), + xaxis=ax[0], + yaxis=axes[0][1], ) # Draw all individual (per sample) lines (ICE) @@ -2221,42 +2182,38 @@ def plot_partial_dependence( replace=False, ) for sample in pred["individual"][target_c, idx, :]: - fig.add_trace( - go.Scatter( - x=pred["values"][0], - y=sample, - mode="lines", - line={"width": 0.5, "color": color}, - name=m.name, - legendgroup=m.name, - showlegend=BasePlot._fig.showlegend(m.name, legend), - xaxis=ax[0], - yaxis=axes[0][1], - ) + fig.add_scatter( + x=pred["values"][0], + y=sample, + mode="lines", + line={"width": 0.5, "color": color}, + name=m.name, + legendgroup=m.name, + showlegend=BasePlot._fig.showlegend(m.name, legend), + xaxis=ax[0], + yaxis=axes[0][1], ) else: colorscale = PALETTE.get(BasePlot._fig.get_elem(m.name), "Teal") - fig.add_trace( - go.Contour( - x=pred["values"][0], - y=pred["values"][1], - z=pred["average"][target_c], - contours={ - "showlabels": True, - "labelfont": { - "size": self.tick_fontsize, - "color": "white", - }, + fig.add_contour( + x=pred["values"][0], + y=pred["values"][1], + z=pred["average"][target_c], + contours={ + "showlabels": 
True, + "labelfont": { + "size": self.tick_fontsize, + "color": "white", }, - hovertemplate="x:%{x}
y:%{y}
z:%{z}", - hoverongaps=False, - colorscale=colorscale, - showscale=False, - showlegend=False, - xaxis=ax[0], - yaxis=axes[0][1], - ) + }, + hovertemplate="x:%{x}
y:%{y}
z:%{z}", + hoverongaps=False, + colorscale=colorscale, + showscale=False, + showlegend=False, + xaxis=ax[0], + yaxis=axes[0][1], ) self._plot( @@ -2378,19 +2335,17 @@ def plot_permutation_importance( random_state=self.random_state, ) - fig.add_trace( - go.Box( - x=permutations["importances"].ravel(), - y=list(np.ravel([[fx] * n_repeats for fx in m.branch.features])), - marker_color=BasePlot._fig.get_elem(m.name), - boxpoints="outliers", - orientation="h", - name=m.name, - legendgroup=m.name, - showlegend=BasePlot._fig.showlegend(m.name, legend), - xaxis=xaxis, - yaxis=yaxis, - ) + fig.add_box( + x=permutations["importances"].ravel(), + y=list(np.ravel([[fx] * n_repeats for fx in m.branch.features])), + marker_color=BasePlot._fig.get_elem(m.name), + boxpoints="outliers", + orientation="h", + name=m.name, + legendgroup=m.name, + showlegend=BasePlot._fig.showlegend(m.name, legend), + xaxis=xaxis, + yaxis=yaxis, ) fig.update_layout( @@ -2830,17 +2785,15 @@ def plot_prc( # Get precision-recall pairs for different thresholds prec, rec, _ = precision_recall_curve(y_true, y_pred) - fig.add_trace( - self._draw_line( - x=rec, - y=prec, - mode="lines", - parent=m.name, - child=child, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) + self._draw_line( + x=rec, + y=prec, + mode="lines", + parent=m.name, + child=child, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, ) self._draw_straight_line( @@ -2971,29 +2924,27 @@ def plot_probabilities( else: hist = y_pred.loc[y_true == v, str(cls)] - fig.add_trace( - go.Scatter( - x=(x := np.linspace(0, 1, 100)), - y=stats.gaussian_kde(hist)(x), - mode="lines", - line={ - "width": 2, - "color": BasePlot._fig.get_elem(m.name), - "dash": BasePlot._fig.get_elem(str(v), "dash"), - }, - fill="tonexty", - fillcolor=f"rgba{BasePlot._fig.get_elem(m.name)[3:-1]}, 0.2)", - fillpattern={"shape": BasePlot._fig.get_elem(str(v), "shape")}, - name=f"{col}={v}", - legendgroup=m.name, - legendgrouptitle={ - "text": m.name, - "font_size": 
self.label_fontsize, - }, - showlegend=BasePlot._fig.showlegend(f"{m.name}-{v}", legend), - xaxis=xaxis, - yaxis=yaxis, - ) + fig.add_sactter( + x=(x := np.linspace(0, 1, 100)), + y=stats.gaussian_kde(hist)(x), + mode="lines", + line={ + "width": 2, + "color": BasePlot._fig.get_elem(m.name), + "dash": BasePlot._fig.get_elem(str(v), "dash"), + }, + fill="tonexty", + fillcolor=f"rgba{BasePlot._fig.get_elem(m.name)[3:-1]}, 0.2)", + fillpattern={"shape": BasePlot._fig.get_elem(str(v), "shape")}, + name=f"{col}={v}", + legendgroup=m.name, + legendgrouptitle={ + "text": m.name, + "font_size": self.label_fontsize, + }, + showlegend=BasePlot._fig.showlegend(f"{m.name}-{v}", legend), + xaxis=xaxis, + yaxis=yaxis, ) BasePlot._fig.used_models.extend(models_c) @@ -3112,33 +3063,29 @@ def plot_residuals( for child, ds in self._get_set(rows): y_true, y_pred = m._get_pred(ds, target) - fig.add_trace( - self._draw_line( - x=y_true, - y=(res := np.subtract(y_true, y_pred)), - mode="markers", - parent=m.name, - child=child, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) + self._draw_line( + x=y_true, + y=(res := np.subtract(y_true, y_pred)), + mode="markers", + parent=m.name, + child=child, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, ) - fig.add_trace( - go.Histogram( - y=res, - bingroup="residuals", - marker={ - "color": f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)", - "line": {"width": 2, "color": BasePlot._fig.get_elem(m.name)}, - }, - name=m.name, - legendgroup=m.name, - showlegend=False, - xaxis=xaxis2, - yaxis=yaxis, - ) + fig.add_histogram( + y=res, + bingroup="residuals", + marker={ + "color": f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)", + "line": {"width": 2, "color": BasePlot._fig.get_elem(m.name)}, + }, + name=m.name, + legendgroup=m.name, + showlegend=False, + xaxis=xaxis2, + yaxis=yaxis, ) self._draw_straight_line((y_true, res), y=0, xaxis=xaxis, yaxis=yaxis) @@ -3287,40 +3234,36 @@ def plot_results( f"can't be mixed with non-time metrics, got 
{metric_c}." ) - fig.add_trace( - go.Bar( - x=[m.results[met] for m in models_c], - y=[m.name for m in models_c], - orientation="h", - marker={ - "color": f"rgba({BasePlot._fig.get_elem(met)[4:-1]}, 0.2)", - "line": {"width": 2, "color": BasePlot._fig.get_elem(met)}, - }, - hovertemplate=f"%{{x}}{met}", - name=met, - legendgroup=met, - showlegend=BasePlot._fig.showlegend(met, legend), - xaxis=xaxis, - yaxis=yaxis, - ) + fig.add_bar( + x=[m.results[met] for m in models_c], + y=[m.name for m in models_c], + orientation="h", + marker={ + "color": f"rgba({BasePlot._fig.get_elem(met)[4:-1]}, 0.2)", + "line": {"width": 2, "color": BasePlot._fig.get_elem(met)}, + }, + hovertemplate=f"%{{x}}{met}", + name=met, + legendgroup=met, + showlegend=BasePlot._fig.showlegend(met, legend), + xaxis=xaxis, + yaxis=yaxis, ) else: - fig.add_trace( - go.Bar( - x=[m._get_score(met, rows) for m in models_c], - y=[m.name for m in models_c], - orientation="h", - marker={ - "color": f"rgba({BasePlot._fig.get_elem(met.name)[4:-1]}, 0.2)", - "line": {"width": 2, "color": BasePlot._fig.get_elem(met.name)}, - }, - hovertemplate="%{x}", - name=met.name, - legendgroup=met.name, - showlegend=BasePlot._fig.showlegend(met, legend), - xaxis=xaxis, - yaxis=yaxis, - ) + fig.add_bar( + x=[m._get_score(met, rows) for m in models_c], + y=[m.name for m in models_c], + orientation="h", + marker={ + "color": f"rgba({BasePlot._fig.get_elem(met.name)[4:-1]}, 0.2)", + "line": {"width": 2, "color": BasePlot._fig.get_elem(met.name)}, + }, + hovertemplate="%{x}", + name=met.name, + legendgroup=met.name, + showlegend=BasePlot._fig.showlegend(met, legend), + xaxis=xaxis, + yaxis=yaxis, ) fig.update_layout( @@ -3441,17 +3384,15 @@ def plot_roc( *m._get_pred(ds, target, method=("decision_function", "predict_proba")) ) - fig.add_trace( - self._draw_line( - x=fpr, - y=tpr, - mode="lines", - parent=m.name, - child=child, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) + self._draw_line( + x=fpr, + y=tpr, + mode="lines", 
+ parent=m.name, + child=child, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, ) self._draw_straight_line((fpr, tpr), y="diagonal", xaxis=xaxis, yaxis=yaxis) @@ -3564,24 +3505,21 @@ def plot_successive_halving( std[m._group].append(m.bootstrap.loc[:, met].std()) for group in x: - fig.add_trace( - self._draw_line( - x=x[group], - y=y[group], - mode="lines+markers", - marker_symbol="circle", - error_y={"type": "data", "array": std[group], "visible": True}, - parent=group, - child=self._metric[met].name, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) + self._draw_line( + x=x[group], + y=y[group], + mode="lines+markers", + marker_symbol="circle", + error_y={"type": "data", "array": std[group], "visible": True}, + parent=group, + child=self._metric[met].name, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, ) # Add error bands if m.bootstrap is not None: - fillcolor = f"rgba{BasePlot._fig.get_elem(group)[3:-1]}, 0.2)" fig.add_traces( [ go.Scatter( @@ -3601,7 +3539,7 @@ def plot_successive_halving( mode="lines", line={"width": 1, "color": BasePlot._fig.get_elem(group)}, fill="tonexty", - fillcolor=fillcolor, + fillcolor=f"rgba{BasePlot._fig.get_elem(group)[3:-1]}, 0.2)", hovertemplate="%{y}lower bound", legendgroup=group, showlegend=False, @@ -3743,16 +3681,14 @@ def plot_threshold( for m in models_c: y_true, y_pred = m._get_pred(rows, target, method="predict_proba") for met in metric_c: - fig.add_trace( - self._draw_line( - x=(x := np.linspace(0, 1, steps)), - y=[met(y_true, y_pred >= step) for step in x], - parent=m.name, - child=met.__name__, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) + self._draw_line( + x=(x := np.linspace(0, 1, steps)), + y=[met(y_true, y_pred >= step) for step in x], + parent=m.name, + child=met.__name__, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, ) BasePlot._fig.used_models.extend(models_c) From 2ae20b4f5e894ab24ceb74ac219b86c5c20850d7 Mon Sep 17 00:00:00 2001 From: Marco van den Boom Date: Thu, 25 Jan 2024 15:54:55 +0100 Subject: 
[PATCH 2/2] added acf, pacf and decomposition plots --- atom/atom.py | 18 +- atom/data_cleaning.py | 7 +- atom/plots/basefigure.py | 2 +- atom/plots/baseplot.py | 9 +- atom/plots/dataplot.py | 487 +++++++++++++++++-- atom/plots/hyperparametertuningplot.py | 4 +- atom/plots/predictionplot.py | 35 +- atom/plots/shapplot.py | 11 +- atom/utils/types.py | 15 + docs/API/ATOM/atomclassifier/index.html | 6 +- docs/API/ATOM/atomforecaster/index.html | 6 +- docs/API/ATOM/atomregressor/index.html | 6 +- docs/API/plots/plot_parshap/index.html | 2 +- docs/search/search_index.json | 2 +- docs_sources/api/plots/plot_acf.md | 16 + docs_sources/api/plots/plot_decomposition.md | 16 + docs_sources/api/plots/plot_pacf.md | 16 + docs_sources/changelog/v5.x.x.md | 59 --- docs_sources/changelog/v6.x.x.md | 59 +++ docs_sources/dependencies.md | 2 +- mkdocs.yml | 3 + pyproject.toml | 2 +- tests/test_plots.py | 12 + 23 files changed, 632 insertions(+), 163 deletions(-) create mode 100644 docs_sources/api/plots/plot_acf.md create mode 100644 docs_sources/api/plots/plot_decomposition.md create mode 100644 docs_sources/api/plots/plot_pacf.md create mode 100644 docs_sources/changelog/v6.x.x.md diff --git a/atom/atom.py b/atom/atom.py index e6e2fdc89..2095c9c74 100644 --- a/atom/atom.py +++ b/atom/atom.py @@ -56,9 +56,9 @@ FloatZeroToOneInc, Index, IndexSelector, Int, IntLargerEqualZero, IntLargerTwo, IntLargerZero, MetricConstructor, ModelsConstructor, NItems, NJobs, NormalizerStrats, NumericalStrats, Operators, Pandas, Predictor, - PrunerStrats, RowSelector, Scalar, ScalerStrats, Seasonality, Sequence, - Series, TargetSelector, Transformer, VectorizerStarts, Verbose, Warnings, - XSelector, YSelector, sequence_t, + PrunerStrats, RowSelector, Scalar, ScalerStrats, Seasonality, + SeasonalityMode, Sequence, Series, TargetSelector, Transformer, + VectorizerStarts, Verbose, Warnings, XSelector, YSelector, sequence_t, ) from atom.utils.utils import ( ClassMap, DataConfig, DataContainer, Goal, 
adjust_verbosity, bk, @@ -922,7 +922,7 @@ def shrink( Whether to convert all features to sparse format. The value that is compressed is the most frequent value in the column. - columns: int, str, segment, sequence or None, default=None + columns: int, str, segment, sequence, dataframe or None, default=None [Selection of columns][row-and-column-selection] to shrink. If None, transform all columns. @@ -1201,7 +1201,7 @@ def _add_transformer( has the `n_jobs` and/or `random_state` parameters, it adopts atom's values. - columns: int, str, segment, sequence or None, default=None + columns: int, str, segment, sequence, dataframe or None, default=None Columns in the dataset to transform. If None, transform all features. @@ -1388,7 +1388,7 @@ def add( instance), and it has the `n_jobs` and/or `random_state` parameters, it adopts atom's values. - columns: int, str, segment, sequence or None, default=None + columns: int, str, segment, sequence, dataframe or None, default=None [Selection of columns][row-and-column-selection] to transform. Only select features or the target column, not both at the same time (if that happens, the target column @@ -1564,7 +1564,7 @@ def decompose( self, *, model: str | Predictor | None = None, - mode: Literal["additive", "multiplicative"] = "additive", + mode: SeasonalityMode = "additive", **kwargs, ): """Detrend and deseasonalize the time series. @@ -1584,9 +1584,7 @@ def decompose( * Use the `columns` parameter to only decompose the target column, e.g., `atom.decompose(columns=atom.target)`. * Use the [plot_decomposition][] method to visualize the - trend, seasonality and residuals of the time series. This - can help to determine if the data follows an additive or - multiplicative trend. + trend, seasonality and residuals of the time series. 
""" columns = kwargs.pop("columns", None) diff --git a/atom/data_cleaning.py b/atom/data_cleaning.py index fb4452000..20c0886c6 100644 --- a/atom/data_cleaning.py +++ b/atom/data_cleaning.py @@ -47,8 +47,9 @@ Bins, Bool, CategoricalStrats, DataFrame, DiscretizerStrats, Engine, Estimator, FloatLargerZero, IntLargerEqualZero, IntLargerTwo, IntLargerZero, NJobs, NormalizerStrats, NumericalStrats, Pandas, Predictor, - PrunerStrats, Scalar, ScalerStrats, Sequence, Series, Transformer, Verbose, - XSelector, YSelector, dataframe_t, sequence_t, series_t, + PrunerStrats, Scalar, ScalerStrats, SeasonalityMode, Sequence, Series, + Transformer, Verbose, XSelector, YSelector, dataframe_t, sequence_t, + series_t, ) from atom.utils.utils import ( Goal, bk, composed, crash, get_col_order, get_cols, it, lst, merge, @@ -1083,7 +1084,7 @@ def __init__( *, model: str | Predictor | None = None, sp: IntLargerZero | None = None, - mode: Literal["additive", "multiplicative"] = "additive", + mode: SeasonalityMode = "additive", n_jobs: NJobs = 1, verbose: Verbose = 0, logger: str | Path | Logger | None = None, diff --git a/atom/plots/basefigure.py b/atom/plots/basefigure.py index f77c973c2..e50d22bb2 100644 --- a/atom/plots/basefigure.py +++ b/atom/plots/basefigure.py @@ -190,7 +190,7 @@ def get_elem( else: return self.style[element].setdefault(name, next(getattr(self, element))) - def showlegend(self, name: str, legend: Legend | dict | None) -> bool: + def showlegend(self, name: str, legend: Legend | dict[str, Any] | None) -> bool: """Get whether the trace should be showed in the legend. If there's already a trace with the same name, it's not diff --git a/atom/plots/baseplot.py b/atom/plots/baseplot.py index faa5fbb72..72cdf8774 100644 --- a/atom/plots/baseplot.py +++ b/atom/plots/baseplot.py @@ -408,14 +408,16 @@ def _draw_line( child: str or None, default=None Name of the secondary attribute. 
- legend: str, dict or None + legend: str, dict or None, default=None Legend argument provided by the user. **kwargs Additional keyword arguments for the trace. """ - Baseplot._fig.figure.add_scatter( + BasePlot._fig.figure.add_scatter( + name=kwargs.pop("name", child or parent), + mode=kwargs.pop("mode", "lines"), line=kwargs.pop( "line", { "width": self.line_width, @@ -435,7 +437,6 @@ def _draw_line( "hovertemplate", f"(%{{x}}, %{{y}}){parent}{f' - {child}' if child else ''}", ), - name=kwargs.pop("name", child or parent), legendgroup=kwargs.pop("legendgroup", parent), legendgrouptitle=kwargs.pop( "legendgrouptitle", @@ -443,7 +444,7 @@ def _draw_line( ), showlegend=kwargs.pop( "showlegend", - BasePlot._fig.showlegend(f"{parent}-{child}", legend) + BasePlot._fig.showlegend(f"{parent}-{child}" if child else parent, legend) ), **kwargs, ) diff --git a/atom/plots/dataplot.py b/atom/plots/dataplot.py index de26e44b5..df5a6e175 100644 --- a/atom/plots/dataplot.py +++ b/atom/plots/dataplot.py @@ -10,10 +10,9 @@ from abc import ABCMeta from pathlib import Path from typing import Any, Literal -from statsmodels.tsa.stattools import pacf + import numpy as np import pandas as pd -from sklearn.utils.metaestimators import available_if import plotly.graph_objects as go from beartype import beartype from nltk.collocations import ( @@ -22,15 +21,19 @@ ) from scipy import stats from sklearn.base import is_classifier +from sklearn.utils.metaestimators import available_if +from statsmodels.tsa.seasonal import seasonal_decompose +from statsmodels.tsa.stattools import acf, pacf from atom.plots.baseplot import BasePlot from atom.utils.constants import PALETTE from atom.utils.types import ( - Bool, ColumnSelector, DataFrame, Int, IntLargerZero, Legend, RowSelector, - Segment, Sequence, Series, + Bool, ColumnSelector, DataFrame, Int, IntLargerZero, Legend, PACFMethods, + RowSelector, SeasonalityMode, Segment, Sequence, Series, ) from atom.utils.utils import ( - check_dependency, crash, 
divide, get_corpus, lst, replace_missing, rnd, has_task + check_dependency, crash, divide, get_corpus, has_task, lst, + replace_missing, rnd, ) @@ -44,6 +47,170 @@ class DataPlot(BasePlot, metaclass=ABCMeta): """ + @available_if(has_task("forecast")) + @crash + def plot_acf( + self, + columns: ColumnSelector | None = None, + nlags: IntLargerZero | None = None, + *, + title: str | dict[str, Any] | None = None, + legend: Legend | dict[str, Any] | None = "upper right", + figsize: tuple[IntLargerZero, IntLargerZero] | None = None, + filename: str | Path | None = None, + display: Bool | None = True, + ) -> go.Figure | None: + """Plot the autocorrelation function. + + The autocorrelation function (ACF) measures the correlation + between a time series and lagged versions of itself. It's + useful, for example, to identify the order of an autoregressive + model. This plot is only available for [forecast][time-series] + tasks. + + Parameters + ---------- + columns: int, str, segment, sequence, dataframe or None, default=None + Columns to plot the pacf from. If None, it selects the + target column. + + nlags: int or None, default=None + Number of lags to return autocorrelation for. If None, it + uses `min(10 * np.log10(len(y)), len(y) // 2 - 1)`. The + returned value includes lag 0 (i.e., 1), so the size of the + vector is `(nlags + 1,)`. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="upper right" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple or None, default=None + Figure's size in pixels, format as (x, y). If None, it + adapts the size to the number of lags shown. 
+ + filename: str, Path or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. + + See Also + -------- + atom.plots:DataPlot.plot_acf + atom.plots:DataPlot.plot_decomposition + atom.plots:DataPlot.plot_ttf + + Examples + -------- + ```pycon + from atom import ATOMForecaster + from sktime.datasets import load_airline + + y = load_airline() + + atom = ATOMForecaster(y, random_state=1) + atom.plot_acf() + ``` + + """ + if columns is None: + columns_c = lst(self.branch.target) + else: + columns_c = self.branch._get_columns(columns) + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + + if nlags is None: + nlags = min(int(10 * np.log10(self.branch.shape[0])), self.branch.shape[0] // 2 - 1) + + for col in columns_c: + # Returns correlation array and confidence interval + corr, conf = acf(self.branch.dataset[col], nlags=nlags, alpha=0.05) + + for pos in (x := np.arange(len(corr))): + self._draw_line( + x=(pos, pos), + y=(0, corr[pos]), + parent=col, + hoverinfo="skip", + xaxis=xaxis, + yaxis=yaxis, + ) + + self._draw_line( + x=x, + y=corr, + parent=col, + mode="markers", + legend=legend, + xaxis=xaxis, + yaxis=yaxis, + ) + + fig.add_traces( + [ + go.Scatter( + x=x, + y=np.subtract(conf[:, 1], corr), + mode="lines", + line={"width": 1, "color": BasePlot._fig.get_elem(col)}, + hovertemplate="%{y}upper bound", + legendgroup=col, + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, + ), + go.Scatter( + x=x, + y=np.subtract(conf[:, 0], corr), + mode="lines", + line={"width": 1, "color": BasePlot._fig.get_elem(col)}, + fill="tonexty", + 
fillcolor=f"rgba({BasePlot._fig.get_elem(col)[4:-1]}, 0.2)", + hovertemplate="%{y}lower bound", + legendgroup=col, + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, + ), + ] + ) + + fig.update_yaxes(zerolinecolor="black") + fig.update_layout({"hovermode": "x unified"}) + + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlabel="Lag", + ylabel="Autocorrelation", + title=title, + legend=legend, + figsize=figsize or (700 + nlags * 10, 600), + plotname="plot_acf", + filename=filename, + display=display, + ) + @crash def plot_components( self, @@ -294,6 +461,178 @@ def plot_correlation( display=display, ) + @available_if(has_task("forecast")) + @crash + def plot_decomposition( + self, + columns: ColumnSelector | None = None, + mode: SeasonalityMode = "additive", + *, + title: str | dict[str, Any] | None = None, + legend: Legend | dict[str, Any] | None = "out", + figsize: tuple[IntLargerZero, IntLargerZero] = (900, 900), + filename: str | Path | None = None, + display: Bool | None = True, + ) -> go.Figure | None: + """Plot the trend, seasonality and residuals of a time series. + + This plot is only available for [forecast][time-series] tasks. + + !!! tip + Use atom's [decompose][atomforecaster-decompose] method to + remove trend and seasonality from the data. + + Parameters + ---------- + columns: int, str, segment, sequence or dataframe, default=-1 + [Selection of columns][row-and-column-selection] to plot. + If None, the target column is selected. + + mode: str, default="additive" + Mode of the decomposition. Choose from: + + - "additive": Assumes the components have a linear relation, + i.e., y(t) = level + trend + seasonality + noise. + - "multiplicative": Assumes the components have a nonlinear + relation, i.e., y(t) = level * trend * seasonality * noise. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. 
+ + legend: str, dict or None, default="out" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple, default=(900, 900) + Figure's size in pixels, format as (x, y). + + filename: str, Path or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. + + See Also + -------- + atom.plots:DataPlot.plot_acf + atom.plots:DataPlot.plot_pacf + atom.plots:DataPlot.plot_series + + Examples + -------- + ```pycon + from atom import ATOMForecaster + from sktime.datasets import load_airline + + y = load_airline() + + atom = ATOMForecaster(y, random_state=1) + atom.plot_decomposition() + ``` + + """ + if columns is None: + columns_c = lst(self.branch.target) + else: + columns_c = self.branch._get_columns(columns) + + self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes(y=(0.76, 1.0)) + xaxis2, yaxis2 = BasePlot._fig.get_axes(y=(0.51, 0.74)) + xaxis3, yaxis3 = BasePlot._fig.get_axes(y=(0.26, 0.49)) + xaxis4, yaxis4 = BasePlot._fig.get_axes(y=(0.0, 0.24)) + + # Returns correlation array and confidence interval + decompose = seasonal_decompose( + x=self.branch.dataset[columns_c], + model=mode, + period=self.sp, + ) + + for col in columns_c: + self._draw_line( + x=(x := self._get_plot_index(decompose.trend)), + y=decompose.observed, + parent=col, + child="observed", + legend=legend, + xaxis=xaxis4, + yaxis=yaxis, + ) + + self._draw_line( + x=x, + y=decompose.trend, + parent=col, + child="trend", + 
legend=legend, + xaxis=xaxis4, + yaxis=yaxis2, + ) + + self._draw_line( + x=x, + y=decompose.seasonal, + parent=col, + child="trend", + legend=legend, + xaxis=xaxis4, + yaxis=yaxis3, + ) + + self._draw_line( + x=x, + y=decompose.resid, + parent=col, + child="trend", + legend=legend, + xaxis=xaxis4, + yaxis=yaxis4, + ) + + self._plot( + ax=(f"xaxis{xaxis2[1:]}", f"yaxis{yaxis2[1:]}"), + ylabel="Values", + ) + + self._plot( + ax=(f"xaxis{xaxis3[1:]}", f"yaxis{yaxis3[1:]}"), + ylabel="Values", + ) + + self._plot( + ax=(f"xaxis{xaxis4[1:]}", f"yaxis{yaxis4[1:]}"), + ylabel="Values", + ) + + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlabel=self.branch.dataset.index.name or "index", + ylabel="Values", + title=title, + legend=legend, + figsize=figsize, + plotname="plot_acf", + filename=filename, + display=display, + ) + @crash def plot_distribution( self, @@ -673,21 +1012,22 @@ def get_text(column: Series) -> Series: def plot_pacf( self, columns: ColumnSelector | None = None, - show: IntLargerZero | None = 10, + nlags: IntLargerZero | None = None, + method: PACFMethods = "ywadjusted", *, title: str | dict[str, Any] | None = None, - legend: Legend | dict[str, Any] | None = "lower right", + legend: Legend | dict[str, Any] | None = "upper right", figsize: tuple[IntLargerZero, IntLargerZero] | None = None, filename: str | Path | None = None, display: Bool | None = True, ) -> go.Figure | None: """Plot the partial autocorrelation function. - Missing values are ignored. - - !!! tip - Use atom's [decompose][atomforecaster-decompose] method to - remove trend and seasonality from the data. + The partial autocorrelation function (PACF) measures the + correlation between a time series and lagged versions of + itself. It's useful, for example, to identify the order of + an autoregressive model. This plot is only available for + [forecast][time-series] tasks. Parameters ---------- @@ -695,9 +1035,29 @@ def plot_pacf( Columns to plot the pacf from. 
If None, it selects the target column. - show: int or None, default=10 - Number of n-grams (ordered by number of occurrences) to - show in the plot. If none, show all n-grams (up to 200). + nlags: int or None, default=None + Number of lags to return autocorrelation for. If None, it + uses `min(10 * np.log10(len(y)), len(y) // 2 - 1)`. The + returned value includes lag 0 (i.e., 1), so the size of the + vector is `(nlags + 1,)`. + + method : str, default="ywadjusted" + Specifies which method to use for the calculations. + + - "yw" or "ywadjusted": Yule-Walker with sample-size + adjustment in denominator for acovf. + - "ywm" or "ywmle": Yule-Walker without adjustment. + - "ols" : Regression of time series on lags of it and on + constant. + - "ols-inefficient": Regression of time series on lags using + a single common sample to estimate all pacf coefficients. + - "ols-adjusted": Regression of time series on lags with a + bias adjustment. + - "ld" or "ldadjusted": Levinson-Durbin recursion with bias + correction. + - "ldb" or "ldbiased": Levinson-Durbin recursion without bias + correction. + - "burg": Burg"s partial autocorrelation estimator. title: str, dict or None, default=None Title for the plot. @@ -706,7 +1066,7 @@ def plot_pacf( - If str, text for the title. - If dict, [title configuration][parameters]. - legend: str, dict or None, default="lower right" + legend: str, dict or None, default="upper right" Legend for the plot. See the [user guide][parameters] for an extended description of the choices. @@ -716,7 +1076,7 @@ def plot_pacf( figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it - adapts the size to the number of n-grams shown. + adapts the size to the number of lags shown. filename: str, Path or None, default=None Save the plot using this name. 
Use "auto" for automatic @@ -755,39 +1115,76 @@ def plot_pacf( columns_c = lst(self.branch.target) else: columns_c = self.branch._get_columns(columns) - show_c = self._get_show(show) fig = self._get_figure() xaxis, yaxis = BasePlot._fig.get_axes() + if nlags is None: + nlags = min(int(10 * np.log10(self.branch.shape[0])), self.branch.shape[0] // 2 - 1) + for col in columns_c: - corr_array = pacf(self.branch.dataset[col].dropna(), nlags=10, alpha=0.05) + # Returns correlation array and confidence interval + corr, conf = pacf(self.branch.dataset[col], nlags=nlags, method=method, alpha=0.05) - lower_y = corr_array[1][:, 0] - corr_array[0] - upper_y = corr_array[1][:, 1] - corr_array[0] + for pos in (x := np.arange(len(corr))): + self._draw_line( + x=(pos, pos), + y=(0, corr[pos]), + parent=col, + hoverinfo="skip", + xaxis=xaxis, + yaxis=yaxis, + ) - for x in range(len(corr_array[0])): - fig.add_scatter(x=(x, x), y=(0, corr_array[0][x]), mode='lines', line_color='#3f3f3f', xaxis=xaxis, yaxis=yaxis) + self._draw_line( + x=x, + y=corr, + parent=col, + mode="markers", + legend=legend, + xaxis=xaxis, + yaxis=yaxis, + ) - fig.add_scatter(x=np.arange(len(corr_array[0])), y=corr_array[0], mode='markers', - marker_color='#1f77b4', - marker_size=12, xaxis=xaxis, yaxis=yaxis) - fig.add_scatter(x=np.arange(len(corr_array[0])), y=upper_y, mode='lines', - line_color='rgba(255,255,255,0)', xaxis=xaxis, yaxis=yaxis) - fig.add_scatter(x=np.arange(len(corr_array[0])), y=lower_y, mode='lines', - fillcolor='rgba(32, 146, 230,0.3)', - fill='tonexty', line_color='rgba(255,255,255,0)', xaxis=xaxis, yaxis=yaxis) + fig.add_traces( + [ + go.Scatter( + x=x, + y=np.subtract(conf[:, 1], corr), + mode="lines", + line={"width": 1, "color": BasePlot._fig.get_elem(col)}, + hovertemplate="%{y}upper bound", + legendgroup=col, + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, + ), + go.Scatter( + x=x, + y=np.subtract(conf[:, 0], corr), + mode="lines", + line={"width": 1, "color": 
BasePlot._fig.get_elem(col)}, + fill="tonexty", + fillcolor=f"rgba({BasePlot._fig.get_elem(col)[4:-1]}, 0.2)", + hovertemplate="%{y}lower bound", + legendgroup=col, + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, + ), + ] + ) - fig.update_traces(showlegend=False) - # fig.update_xaxes(range=[-1, 42]) - fig.update_yaxes(zerolinecolor="black") + fig.update_yaxes(zerolinecolor="black") + fig.update_layout({"hovermode": "x unified"}) return self._plot( ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), xlabel="Lag", + ylabel="Partial autocorrelation", title=title, legend=legend, - figsize=figsize or (900, 400 + show_c * 50), + figsize=figsize or (700 + nlags * 10, 600), plotname="plot_pacf", filename=filename, display=display, @@ -994,7 +1391,7 @@ def plot_qq( """ columns_c = self.branch._get_columns(columns) - fig = self._get_figure() + self._get_figure() xaxis, yaxis = BasePlot._fig.get_axes() percentiles = np.linspace(0, 100, 101) @@ -1285,20 +1682,18 @@ def plot_rfecv( mean = self.rfecv_.cv_results_["mean_test_score"] std = self.rfecv_.cv_results_["std_test_score"] - fig.add_scatter( + self._draw_line( x=list(x), y=mean, + parent="rfecv", + name=ylabel, mode="lines+markers", - line={"width": self.line_width, "color": BasePlot._fig.get_elem("rfecv")}, marker={ "symbol": symbols, "size": sizes, "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"}, "opacity": 1, }, - name=ylabel, - legendgroup="rfecv", - showlegend=BasePlot._fig.showlegend("rfecv", legend), xaxis=xaxis, yaxis=yaxis, ) @@ -1350,6 +1745,7 @@ def plot_rfecv( display=display, ) + @available_if(has_task("forecast")) @crash def plot_series( self, @@ -1364,8 +1760,7 @@ def plot_series( ) -> go.Figure | None: """Plot a data series. - This plot is specially useful to plot the time series for - [forecast][time-series] tasks. + This plot is only available for [forecast][time-series] tasks. 
Parameters ---------- @@ -1437,7 +1832,7 @@ def plot_series( else: columns_c = self.branch._get_columns(columns, include_target=True) - fig = self._get_figure() + self._get_figure() xaxis, yaxis = BasePlot._fig.get_axes() for col in columns_c: @@ -1445,14 +1840,14 @@ def plot_series( self._draw_line( x=self._get_plot_index(y := self.branch._get_rows(ds)[col]), y=y, + parent=col, + child=child, mode="lines+markers", marker={ "size": self.marker_size, "color": BasePlot._fig.get_elem(col), "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"}, }, - parent=col, - child=child, legend=legend, xaxis=xaxis, yaxis=yaxis, diff --git a/atom/plots/hyperparametertuningplot.py b/atom/plots/hyperparametertuningplot.py index e520a6e03..5b808badc 100644 --- a/atom/plots/hyperparametertuningplot.py +++ b/atom/plots/hyperparametertuningplot.py @@ -248,7 +248,7 @@ def plot_edf( x_max = bk.concat([m.trials[metric_c] for m in models_c]).max(axis=None) x = np.linspace(x_min, x_max, 100) - fig = self._get_figure() + self._get_figure() xaxis, yaxis = BasePlot._fig.get_axes() for m in models_c: @@ -1178,7 +1178,7 @@ def plot_terminator_improvement( models_c = self._get_plot_models(models, ensembles=False) models_c = self._check_hyperparams(models_c) - fig = self._get_figure() + self._get_figure() xaxis, yaxis = BasePlot._fig.get_axes() for m in models_c: diff --git a/atom/plots/predictionplot.py b/atom/plots/predictionplot.py index 69f5fcd25..af0a13b34 100644 --- a/atom/plots/predictionplot.py +++ b/atom/plots/predictionplot.py @@ -687,7 +687,7 @@ def plot_det( """ models_c = self._get_plot_models(models) - fig = self._get_figure() + self._get_figure() xaxis, yaxis = BasePlot._fig.get_axes() for m in models_c: @@ -700,7 +700,6 @@ def plot_det( self._draw_line( x=fpr, y=fnr, - mode="lines", parent=m.name, child=child, legend=legend, @@ -812,7 +811,7 @@ def plot_errors( """ models_c = self._get_plot_models(models) - fig = self._get_figure() + self._get_figure() xaxis, yaxis = 
BasePlot._fig.get_axes() for m in models_c: @@ -839,7 +838,6 @@ def plot_errors( self._draw_line( x=(x := np.linspace(y_true.min(), y_true.max(), 100)), y=estimator.predict(x[:, np.newaxis]), - mode="lines", hovertemplate="(%{x}, %{y})", parent=m.name, legend=None, @@ -943,7 +941,7 @@ def plot_evals( """ models_c = self._get_plot_models(models, ensembles=False) - fig = self._get_figure() + self._get_figure() xaxis, yaxis = BasePlot._fig.get_axes() for m in models_c: @@ -1441,7 +1439,7 @@ def plot_gains( """ models_c = self._get_plot_models(models) - fig = self._get_figure() + self._get_figure() xaxis, yaxis = BasePlot._fig.get_axes() for m in models_c: @@ -1453,7 +1451,6 @@ def plot_gains( self._draw_line( x=(x := np.arange(start=1, stop=len(y_true) + 1) / len(y_true)), y=(y := np.cumsum(y_true.iloc[np.argsort(y_pred)[::-1]]) / y_true.sum()), - mode="lines", parent=m.name, child=child, legend=legend, @@ -1718,7 +1715,7 @@ def plot_lift( """ models_c = self._get_plot_models(models) - fig = self._get_figure() + self._get_figure() xaxis, yaxis = BasePlot._fig.get_axes() for m in models_c: @@ -1730,7 +1727,6 @@ def plot_lift( self._draw_line( x=(x := np.arange(start=1, stop=len(y_true) + 1) / len(y_true)), y=(y := np.cumsum(y_true.iloc[np.argsort(y_pred)[::-1]]) / y_true.sum() / x), - mode="lines", parent=m.name, child=child, legend=legend, @@ -1785,8 +1781,9 @@ def plot_parshap( models: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. - columns: int, str, segment, sequence or None, default=None - XSelector to plot. If None, it plots all features. + columns: int, str, segment, sequence, dataframe or None, default=None + [Feature set][row-and-column-selection] to plot. If None, + it selects all features. target: int, str or tuple, default=1 Class in the target column to target. 
For multioutput tasks, @@ -1984,9 +1981,9 @@ def plot_partial_dependence( models: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. - columns: int, str, segment, sequence, dataframe, default=(0, 1, 2) - [XSelector][row-and-column-selection] to get the partial - dependence from. + columns: int, str, segment, sequence or dataframe, default=(0, 1, 2) + [Feature set][row-and-column-selection] to get the + partial dependence from. kind: str or sequence, default="average" Kind of dependence to plot. Use a sequence or add `+` between @@ -2773,7 +2770,7 @@ def plot_prc( """ models_c = self._get_plot_models(models) - fig = self._get_figure() + self._get_figure() xaxis, yaxis = BasePlot._fig.get_axes() for m in models_c: @@ -2788,7 +2785,6 @@ def plot_prc( self._draw_line( x=rec, y=prec, - mode="lines", parent=m.name, child=child, legend=legend, @@ -2924,7 +2920,7 @@ def plot_probabilities( else: hist = y_pred.loc[y_true == v, str(cls)] - fig.add_sactter( + fig.add_scatter( x=(x := np.linspace(0, 1, 100)), y=stats.gaussian_kde(hist)(x), mode="lines", @@ -3374,7 +3370,7 @@ def plot_roc( """ models_c = self._get_plot_models(models) - fig = self._get_figure() + self._get_figure() xaxis, yaxis = BasePlot._fig.get_axes() for m in models_c: @@ -3387,7 +3383,6 @@ def plot_roc( self._draw_line( x=fpr, y=tpr, - mode="lines", parent=m.name, child=child, legend=legend, @@ -3675,7 +3670,7 @@ def plot_threshold( metric_c.append(m) metric_c = [get_custom_scorer(m)._score_func for m in metric_c] - fig = self._get_figure() + self._get_figure() xaxis, yaxis = BasePlot._fig.get_axes() for m in models_c: diff --git a/atom/plots/shapplot.py b/atom/plots/shapplot.py index 0242f8788..7c5bc10d3 100644 --- a/atom/plots/shapplot.py +++ b/atom/plots/shapplot.py @@ -20,8 +20,8 @@ from atom.plots.baseplot import BasePlot from atom.utils.types import ( - Bool, Int, IntLargerZero, Legend, ModelSelector, RowSelector, - TargetsSelector, + Bool, 
ColumnSelector, Int, IntLargerZero, Legend, ModelSelector, + RowSelector, TargetsSelector, ) from atom.utils.utils import check_canvas, crash, has_task @@ -645,7 +645,7 @@ def plot_shap_scatter( self, models: ModelSelector | None = None, rows: RowSelector = "test", - columns: Int | str = 0, + columns: ColumnSelector = 0, target: TargetsSelector = 1, *, title: str | dict[str, Any] | None = None, @@ -676,8 +676,9 @@ def plot_shap_scatter( plot_shap_scatter method does not support plotting a single sample. - columns: int or str, default=0 - Column to plot. + columns: int, str, segment, sequence or dataframe, default=0 + [Feature][row-and-column-selection] to plot. Only one + column can be selected. target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, diff --git a/atom/utils/types.py b/atom/utils/types.py index d1075aa27..70bcd49f3 100644 --- a/atom/utils/types.py +++ b/atom/utils/types.py @@ -284,8 +284,23 @@ def predict(self, *args, **kwargs) -> Pandas: ... # Others Seasonality: TypeAlias = IntLargerOne | str | Sequence[IntLargerOne | str] | None +SeasonalityMode: TypeAlias = Literal["additive", "multiplicative"] HarmonicsSelector: TypeAlias = Literal["drop", "raw_strength", "harmonic_strength"] Stages: TypeAlias = Literal["None", "Staging", "Production", "Archived"] +PACFMethods: TypeAlias = Literal[ + "yw", + "ywadjusted", + "ywm", + "ywmle", + "ols", + "ols-inefficient", + "ols-adjusted", + "ld", + "ldadjusted", + "ldb", + "ldbiased", + "burg", +] NItems: TypeAlias = ( IntLargerEqualZero | dict[str, IntLargerEqualZero] diff --git a/docs/API/ATOM/atomclassifier/index.html b/docs/API/ATOM/atomclassifier/index.html index d4056742b..734fe884f 100644 --- a/docs/API/ATOM/atomclassifier/index.html +++ b/docs/API/ATOM/atomclassifier/index.html @@ -5177,7 +5177,7 @@

Utility methods

Parameterstransformer: Transformer

Estimator to add to the pipeline. Should implement a transform method.

-

columns: int, str, segment, sequence or None, default=None
+

columns: int, str, segment, sequence, dataframe or None, default=None
Selection of columns to transform. Only select features or the target column, not both at the same time (if that happens, the target column @@ -5323,7 +5323,7 @@

Utility methods

Names of the distributions in scipy.stats to get the statistics on. If None, a selection of the most common ones is used.

-

columns: int, str, segment, sequence or None, default=None
+

columns: int, str, segment, sequence, dataframe or None, default=None
Selection of columns to perform the test on. If None, select all numerical columns.

Returnspd.DataFrame
@@ -5602,7 +5602,7 @@

Utility methods

dense2sparse: bool, default=False
Whether to convert all features to sparse format. The value that is compressed is the most frequent value in the column.

-

columns: int, str, segment, sequence or None, default=None
+

columns: int, str, segment, sequence, dataframe or None, default=None
Selection of columns to shrink. If None, transform all columns.

diff --git a/docs/API/ATOM/atomforecaster/index.html b/docs/API/ATOM/atomforecaster/index.html index 56ffb019c..4d64fb98b 100644 --- a/docs/API/ATOM/atomforecaster/index.html +++ b/docs/API/ATOM/atomforecaster/index.html @@ -5133,7 +5133,7 @@

Utility methods

Parameterstransformer: Transformer

Estimator to add to the pipeline. Should implement a transform method.

-

columns: int, str, segment, sequence or None, default=None
+

columns: int, str, segment, sequence, dataframe or None, default=None
Selection of columns to transform. Only select features or the target column, not both at the same time (if that happens, the target column @@ -5279,7 +5279,7 @@

Utility methods

Names of the distributions in scipy.stats to get the statistics on. If None, a selection of the most common ones is used.

-

columns: int, str, segment, sequence or None, default=None
+

columns: int, str, segment, sequence, dataframe or None, default=None
Selection of columns to perform the test on. If None, select all numerical columns.

Returnspd.DataFrame
@@ -5558,7 +5558,7 @@

Utility methods

dense2sparse: bool, default=False
Whether to convert all features to sparse format. The value that is compressed is the most frequent value in the column.

-

columns: int, str, segment, sequence or None, default=None
+

columns: int, str, segment, sequence, dataframe or None, default=None
Selection of columns to shrink. If None, transform all columns.

diff --git a/docs/API/ATOM/atomregressor/index.html b/docs/API/ATOM/atomregressor/index.html index c524785d9..14369d84b 100644 --- a/docs/API/ATOM/atomregressor/index.html +++ b/docs/API/ATOM/atomregressor/index.html @@ -5154,7 +5154,7 @@

Utility methods

Parameterstransformer: Transformer

Estimator to add to the pipeline. Should implement a transform method.

-

columns: int, str, segment, sequence or None, default=None
+

columns: int, str, segment, sequence, dataframe or None, default=None
Selection of columns to transform. Only select features or the target column, not both at the same time (if that happens, the target column @@ -5300,7 +5300,7 @@

Utility methods

Names of the distributions in scipy.stats to get the statistics on. If None, a selection of the most common ones is used.

-

columns: int, str, segment, sequence or None, default=None
+

columns: int, str, segment, sequence, dataframe or None, default=None
Selection of columns to perform the test on. If None, select all numerical columns.

Returnspd.DataFrame
@@ -5579,7 +5579,7 @@

Utility methods

dense2sparse: bool, default=False
Whether to convert all features to sparse format. The value that is compressed is the most frequent value in the column.

-

columns: int, str, segment, sequence or None, default=None
+

columns: int, str, segment, sequence, dataframe or None, default=None
Selection of columns to shrink. If None, transform all columns.

diff --git a/docs/API/plots/plot_parshap/index.html b/docs/API/plots/plot_parshap/index.html index 726e50470..879472e61 100644 --- a/docs/API/plots/plot_parshap/index.html +++ b/docs/API/plots/plot_parshap/index.html @@ -4543,7 +4543,7 @@

plot_parshap

Parametersmodels: int, str, Model, segment, sequence or None, default=None
Models to plot. If None, all models are selected.

-

columns: int, str, segment, sequence or None, default=None
+

columns: int, str, segment, sequence, dataframe or None, default=None
XSelector to plot. If None, it plots all features.

target: int, str or tuple, default=1
Class in the target column to target. For multioutput tasks, diff --git a/docs/search/search_index.json b/docs/search/search_index.json index 42381e4ba..4fd8e276d 100644 --- a/docs/search/search_index.json +++ b/docs/search/search_index.json @@ -1 +1 @@ -{"config": {"lang": ["en"], "separator": "[\\s\\-]+", "pipeline": ["stopWordFilter"]}, "docs": [{"location": "about/", "title": "About", "text": ""}, {"location": "about/#what-is-it", "title": "What is it?", "text": "

Automated Tool for Optimized Modeling (ATOM) is an open-source Python package designed to help data scientists speed up the exploration phase of their machine learning projects. ATOM is a low-code, easy-to-use library, capable of running experiments quickly and efficiently, enabling the user to go from raw data to generating insights in just a few lines of code. Click here to get started.

"}, {"location": "about/#what-can-i-do-with-it", "title": "What can I do with it?", "text": "

ATOM is an end-to-end solution for machine learning pipelines. It supports the user from raw data ingestion to the final results' analysis and model deployment. Click on the icons to read more about its main functionalities.

Data cleaning Feature engineering Model selection Hyperparametertuning Model training Model predictions Experiment logging Analysis &Interpretability"}, {"location": "about/#who-is-it-intended-for", "title": "Who is it intended for?", "text": "
  • Data scientists that want to speed up the exploration phase of their machine learning projects.
  • Data scientists that want to run a simple modeling experiment without having to spend too much time on coding.
  • Data scientists that are new to Python and are not (yet) familiar with all the relevant machine learning packages.
  • Data analysts without extensive knowledge of machine learning that want to try out model-based solutions.
  • Anyone who wants to rapidly build a Proof of Concept, for example during a hackathon.
  • Anyone who is new to the field of machine learning and wants a low-code, easy to learn package, to get started building predictive pipelines.
"}, {"location": "about/#citing-atom", "title": "Citing ATOM", "text": "

If you use ATOM in a scientific publication, please consider citing this documentation page as the resource. ATOM\u2019s first stable release v2.0.3 was made publicly available in November 2019. A formatted version of the citation would look like this:

ATOM v2.0.3, November 2019. URL https://tvdboom.github.io/ATOM/

BibTeX entry:

@Manual{ATOM,\n    title = {ATOM: A Python package for fast exploration of machine learning pipelines},\n    author = {Mavs},\n    year={2019},\n    month={November},\n    note = {ATOM version 2.0.3},\n    url = {https://tvdboom.github.io/ATOM/},\n}\n

"}, {"location": "about/#support", "title": "Support", "text": "

ATOM recognizes the support from JetBrains by providing core project contributors with a set of developer tools free of charge.

"}, {"location": "about/#integrations", "title": "Integrations", "text": ""}, {"location": "contributing/", "title": "Contributing", "text": "

Are you interested in contributing to ATOM? Do you want to report a bug? Do you have a question? Before you do, please read the following guidelines.

"}, {"location": "contributing/#submission-context", "title": "Submission context", "text": ""}, {"location": "contributing/#question-or-problem", "title": "Question or problem?", "text": "

For quick questions, there's no need to open an issue. Check first if the question isn't already answered in the FAQ section. If not, reach us through the discussions page or on the slack channel.

"}, {"location": "contributing/#report-a-bug", "title": "Report a bug?", "text": "

If you found a bug in the source code, you can help by submitting an issue to the issue tracker in the GitHub repository. Even better, you can submit a Pull Request with a fix. However, before doing so, please read the submission guidelines.

"}, {"location": "contributing/#missing-a-feature", "title": "Missing a feature?", "text": "

You can request a new feature by submitting an issue to the GitHub Repository. If you would like to implement a new feature, please submit an issue with a proposal for your work first. Please consider what kind of change it is:

  • For a major feature, first open an issue and outline your proposal so that it can be discussed. This will also allow us to better coordinate our efforts, prevent duplication of work, and help you to craft the change so that it is successfully accepted into the project.

  • Small features and bugs can be crafted and directly submitted as a Pull Request. However, there is no guarantee that your feature will make it into master, as it's always a matter of opinion whether it benefits the overall functionality of the project.

"}, {"location": "contributing/#project-layout", "title": "Project layout", "text": "

The latest stable release of ATOM is on the master branch, whereas the latest version of ATOM in development is on the development branch. Make sure you are looking at and working on the correct branch if you're looking to contribute code.

In terms of directory structure:

  • All of ATOM's code sources are in the atom directory.
  • The documentation sources are in the docs_sources directory.
  • Images in the documentation are in the docs_sources/img directory.
  • Tutorial notebooks are in the examples directory. If you want to include the example to the documentation as well, add the .ipynb file to docs_sources/examples and update the mkdocs.yml file accordingly.
  • Unit tests are in the tests directory. Make sure to add the tests to the file corresponding to the module in the atom directory with the code that is being tested.

Make sure to familiarize yourself with the project layout before making any major contributions, and especially make sure to send all code changes to the development branch.

"}, {"location": "contributing/#submission-guidelines", "title": "Submission guidelines", "text": ""}, {"location": "contributing/#submitting-an-issue", "title": "Submitting an issue", "text": "

Before you submit an issue, please search the issue tracker. An issue for your problem may already exist, and the discussion might inform you of readily available workarounds.

We want to fix all the issues as soon as possible, but before fixing a bug we need to reproduce and confirm it. In order to reproduce bugs we will systematically ask you to provide a minimal reproduction scenario using the custom issue template.

"}, {"location": "contributing/#submitting-a-pull-request", "title": "Submitting a pull request", "text": "

Before you submit a pull request, please work through this checklist to make sure that you have done the necessary so we can efficiently review and accept your changes.

  • Update the documentation so all of your changes are reflected there.
  • Adhere to PEP 8 standards.
  • Use a maximum of 91 characters per line. Try to keep docstrings below 74 characters.
  • Update the project unit tests to test your code changes as thoroughly as possible.
  • Make sure that your code is properly commented with docstrings and comments explaining your rationale behind non-obvious coding practices.
  • Run isort: isort atom tests.
  • Run flake8: flake8 --show-source --statistics atom tests.
  • Run pydocstyle: pydocstyle atom tests.
  • Run mypy: mypy atom tests.

If your contribution requires a new library dependency:

  • Double-check that the new dependency is easy to install via pip and Anaconda.
  • The library should support Python 3.10 and 3.11.
  • Make sure the code works with the latest version of the library.
  • Update the dependencies in the documentation.
  • Add the library with the minimum required version to pyproject.toml.

After submitting your pull request, GitHub will automatically run the tests on your changes and make sure that the updated code builds successfully. The checks run on Python 3.10 and 3.11, on Ubuntu and Windows. We also use services that automatically check code style and test coverage.

"}, {"location": "dependencies/", "title": "Dependencies", "text": ""}, {"location": "dependencies/#python-os", "title": "Python & OS", "text": "

At the moment, ATOM supports the following Python versions:

  • Python 3.10
  • Python 3.11

And operating systems:

  • Linux (Ubuntu, Fedora, etc...)
  • Windows 8.1+
  • macOS (not tested)

"}, {"location": "dependencies/#packages", "title": "Packages", "text": ""}, {"location": "dependencies/#required", "title": "Required", "text": "

ATOM is built on top of several existing Python libraries. These packages are necessary for its correct functioning.

  • beartype (>=0.16.4)
  • category-encoders (>=2.6.3)
  • dagshub (>=0.3.8)
  • dill (>=0.3.6)
  • gplearn (>=0.4.2)
  • imbalanced-learn (>=0.11.0)
  • ipython (>=8.11.0)
  • ipywidgets (>=8.1.1)
  • featuretools (>=1.28.0)
  • joblib (>=1.3.1)
  • matplotlib (>=3.7.2)
  • mlflow (>=2.7.1)
  • modin[ray] (>=0.25.0)
  • nltk (>=3.8.1)
  • numpy (>=1.23.0)
  • optuna (>=3.4.0)
  • pandas[parquet] (>=2.1.2)
  • plotly (>=5.15.0)
  • ray[serve] (>=2.7.1)
  • scikit-learn (>=1.3.1)
  • scikit-learn-intelex (>=2023.2.1)
  • scipy (>=1.10.1)
  • shap (>=0.43.0)
  • sktime (>=0.24.0)
  • zoofs (>=0.1.26)
"}, {"location": "dependencies/#optional", "title": "Optional", "text": "

Some specific models, utility methods or plots require the installation of additional libraries. You can install all the optional dependencies using pip install atom-ml[full]. Doing so also installs the following libraries:

  • botorch (>=0.8.5)
  • catboost (>=1.2)
  • explainerdashboard (>=0.4.3)
  • gradio (>=3.44.4)
  • lightgbm (>=4.1.0)
  • pmdarima (>=2.0.3)
  • schemdraw (>=0.16)
  • sweetviz (>=2.3.1)
  • wordcloud (>=1.9.2)
  • xgboost (>=2.0.0)
"}, {"location": "dependencies/#development", "title": "Development", "text": "

The development dependencies are not installed with the package, and are not required for any of its functionalities. These libraries are only necessary to contribute to the project. Install them running pdm install --dev (don't forget to install pdm with pip install -U pdm).

Linting

  • isort (>=5.12.0)
  • flake8 (>=6.0.0)
  • flake8-pyproject (>=1.2.3)
  • pydocstyle (>=6.3.0)
  • mypy (>=1.6.1)
  • pandas_stubs (>=2.1.1.230928)
  • types-requests (>=2.31.0.10)

Testing

  • nbmake (>=1.4.1)
  • pytest (>=7.2.1)
  • pytest-cov (>=4.0.0)
  • pytest-xdist (>=3.2.0)
  • scikeras (>=0.11.0)
  • tensorflow (>=2.13.0)

Documentation

  • jupyter-contrib-nbextensions (>=0.7.0)
  • mike (>=1.1.2)
  • mkdocs (>=1.5.3)
  • mkdocs-autorefs (>=0.5.0)
  • mkdocs-jupyter (>=0.24.6)
  • mkdocs-material (>=9.4.7)
  • mkdocs-simple-hooks (>=0.1.5)
  • pymdown-extensions (>=10.3.1)
  • pyyaml (>=6.0)
"}, {"location": "faq/", "title": "Frequently asked questions", "text": "

Here we try to give answers to some questions that have popped up regularly. If you have any other questions, don't hesitate to create a new discussion or post them on the Slack channel!

??? faq Is this package related to the Atom text editor?\" There is, indeed, a text editor with the same name and a similar logo as this package. Is this a shameless copy? No. When I started the project, I didn't know about the text editor, and it doesn't require much thinking to come up with the idea of replacing the letter O of the word atom with the image of an atom.

How does ATOM relate to AutoML?

ATOM is not an AutoML tool since it does not automate the search for an optimal pipeline like well-known AutoML tools such as auto-sklearn or EvalML do. Instead, ATOM helps the user find the optimal pipeline himself. One of the goals of this package is to help data scientists produce explainable pipelines, and using an AutoML black box function would impede that.

Is it possible to run deep learning models?

Yes. Deep learning models can be added as custom models to the pipeline as long as they follow sklearn's API. For more information, see the deep learning section of the user guide.

Can I run atom's methods on just a subset of the columns?

Yes, all data cleaning and feature engineering methods accept a columns parameter to only transform the selected features. For example, to only impute the numerical columns in the dataset we could type atom.impute(strat_num=\"mean\", columns=atom.numerical). The parameter accepts column names, column indices, dtypes or a slice object.

How can I compare the same model on different datasets?

In many occasions you might want to test how a model performs on datasets processed with different pipelines. For this, atom has the branch system. Create a new branch for every new pipeline you want to test and use the plot methods to compare all models, independent of the branch it was trained on.

Can I train models through atom using a GPU?

Yes. Refer to the user guide to see what algorithms and models have a GPU implementation. Be aware that it could require additional software and hardware dependencies.

How are numerical and categorical columns differentiated?

The columns are separated using a dataframe's select_dtypes method. Numerical columns are selected using include=\"number\" whereas categorical columns are selected using exclude=\"number\".

Can I run unsupervised learning pipelines?

No. As for now, ATOM only supports supervised machine learning pipelines. However, various unsupervised algorithms can be chosen as strategy in the Pruner class to detect and remove outliers from the dataset.

Is there a way to plot multiple models in the same shap plot?

No. Unfortunately, there is no way to plot multiple models in the same shap plot since the plots are made by the shap package and passed as matplotlib.axes objects to atom. This means that it's not within the reach of this package to implement such a utility.

Can I merge a sklearn pipeline with atom?

Yes. Like any other transformer, it is possible to add a sklearn pipeline to atom using the add method. Every transformer in the pipeline is merged independently. The pipeline is not allowed to end with a model since atom manages its own models. If that is the case, add the pipeline using atom.add(pipeline[:-1]).

Is it possible to initialize atom with an existing train and test set?

Yes. If you already have a separated train and test set you can initialize atom in two ways:

  • atom = ATOMClassifier(train, test)
  • atom = ATOMClassifier((X_train, y_train), (X_test, y_test))

Make sure the train and test sets have the same number of columns! If atom is initialized in either of these two ways, the test_size parameter is ignored.

Can I train the models using cross-validation?

Applying cross-validation means transforming every step of the pipeline multiple times, each with different results. Doing this would prevent ATOM from being able to show the transformation results after every pre-processing step, which means losing the ability to inspect how a transformer changed the dataset. For this reason, it is not possible to apply cross-validation until after a model has been trained. After a model has been trained, the pipeline is defined, and cross-validation can be applied using the cross_validate method. See here an example using cross-validation.

Is there a way to process datetime features?

Yes, the FeatureExtractor class can automatically extract useful features (day, month, year, etc...) from datetime columns. The extracted features are always encoded to numerical values, so they can be fed directly to a model.

"}, {"location": "getting_started/", "title": "Getting started", "text": ""}, {"location": "getting_started/#installation", "title": "Installation", "text": "

Install ATOM's newest release easily via pip:

pip install -U atom-ml\n

or via conda:

conda install -c conda-forge atom-ml\n

Note

Since atom was already taken, download the package under the name atom-ml!

Warning

ATOM makes use of many other ML libraries, making its dependency list quite long. Because of that, the installation may take longer than you are accustomed to. Be patient!

Optional dependencies

Some specific models, utility methods or plots require the installation of additional libraries. To install the optional dependencies, add [full] after the package's name.

pip install -U atom-ml[full]\n

Latest source

Sometimes, new features and bug fixes are already implemented in the development branch, but are waiting for the next release to be made available. If you can't wait for that, it's possible to install the package directly from git.

pip install git+https://github.com/tvdboom/ATOM.git@development#egg=atom-ml\n

Don't forget to include #egg=atom-ml to explicitly name the project. This way, pip can track metadata for it without having to run the setup.py script.

Contributing

If you are planning to contribute to the project, you'll need the development dependencies. Install them by adding [dev] after the package's name.

pip install -U atom-ml[dev]\n

Click here for a complete list of package files for all versions published on PyPI.

"}, {"location": "getting_started/#usage", "title": "Usage", "text": "

ATOM contains a variety of classes and functions to perform data cleaning, feature engineering, model training, plotting and much more. The easiest way to use everything ATOM has to offer is through one of the main classes:

  • ATOMClassifier for classification tasks.
  • ATOMForecaster for forecasting tasks.
  • ATOMRegressor for regression tasks.

Let's walk you through an example. Click on the SageMaker Studio Lab badge on top of this section to run this example yourself.

Make the necessary imports and load the data.

>>> import pandas as pd\n>>> from atom import ATOMClassifier\n\n>>> # Load the Australian Weather dataset\n>>> X = pd.read_csv(\"./examples/datasets/weatherAUS.csv\", nrows=100)\n>>> print(X.head())\n\n           Location  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine WindGustDir  WindGustSpeed WindDir9am WindDir3pm  WindSpeed9am  WindSpeed3pm  Humidity9am  Humidity3pm  Pressure9am  Pressure3pm  Cloud9am  Cloud3pm  Temp9am  Temp3pm RainToday  RainTomorrow\n0  MelbourneAirport     18.0     26.9      21.4          7.0       8.9         SSE           41.0          W        SSE           9.0          20.0         95.0         54.0       1019.5       1017.0       8.0       5.0     18.5     26.0       Yes             0\n1          Adelaide     17.2     23.4       0.0          NaN       NaN           S           41.0          S        WSW          13.0          19.0         59.0         36.0       1015.7       1015.7       NaN       NaN     17.7     21.9        No             0\n2            Cairns     18.6     24.6       7.4          3.0       6.1         SSE           54.0        SSE         SE          26.0          35.0         78.0         57.0       1018.7       1016.6       3.0       3.0     20.8     24.1       Yes             0\n3          Portland     13.6     16.8       4.2          1.2       0.0         ESE           39.0        ESE        ESE          17.0          15.0         76.0         74.0       1021.4       1020.5       7.0       8.0     15.6     16.0       Yes             1\n4           Walpole     16.4     19.9       0.0          NaN       NaN          SE           44.0         SE         SE          19.0          30.0         78.0         70.0       1019.4       1018.9       NaN       NaN     17.4     18.1        No             0\n

Initialize the ATOMClassifier, ATOMForecaster or ATOMRegressor class. These classes are convenient wrappers for the whole machine learning pipeline. Contrary to sklearn's API, they are initialized by providing the data you want to manipulate.

>>> atom = ATOMClassifier(X, y=\"RainTomorrow\", verbose=2)\n\n<< ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (100, 22)\nTrain set size: 80\nTest set size: 20\n-------------------------------------\nMemory: 17.73 kB\nScaled: False\nMissing values: 193 (8.8%)\nCategorical features: 5 (23.8%)\n

Data transformations are applied through atom's methods. For example, calling the impute method will initialize an Imputer instance, fit it on the training set and transform the whole dataset. The transformations are applied immediately after calling the method (no fit and transform commands necessary).

>>> atom.impute(strat_num=\"median\", strat_cat=\"most_frequent\")  \n\nFitting Imputer...\nImputing missing values...\n --> Imputing 1 missing values with median (0.0) in feature Rainfall.\n --> Imputing 36 missing values with median (4.8) in feature Evaporation.\n --> Imputing 38 missing values with median (8.45) in feature Sunshine.\n --> Imputing 8 missing values with most_frequent (SSE) in feature WindGustDir.\n --> Imputing 8 missing values with median (41.0) in feature WindGustSpeed.\n --> Imputing 7 missing values with most_frequent (ESE) in feature WindDir9am.\n --> Imputing 2 missing values with median (13.0) in feature WindSpeed9am.\n --> Imputing 1 missing values with median (74.0) in feature Humidity9am.\n --> Imputing 6 missing values with median (1017.55) in feature Pressure9am.\n --> Imputing 6 missing values with median (1015.4) in feature Pressure3pm.\n --> Imputing 38 missing values with median (5.5) in feature Cloud9am.\n --> Imputing 40 missing values with median (5.0) in feature Cloud3pm.\n --> Imputing 1 missing values with median (17.2) in feature Temp9am.\n --> Imputing 1 missing values with most_frequent (No) in feature RainToday.\n\n>>> atom.encode(strategy=\"Target\", max_onehot=8)\n\nFitting Encoder...\nEncoding categorical columns...\n --> Target-encoding feature Location. Contains 42 classes.\n   --> Handling 2 unknown classes.\n --> Target-encoding feature WindGustDir. Contains 16 classes.\n --> Target-encoding feature WindDir9am. Contains 16 classes.\n   --> Handling 1 unknown classes.\n --> Target-encoding feature WindDir3pm. Contains 16 classes.\n --> Ordinal-encoding feature RainToday. Contains 2 classes.\n

Similarly, models are trained and evaluated using the run method. Here, we fit both a LogisticRegression and LinearDiscriminantAnalysis model, and apply hyperparameter tuning.

>>> atom.run(models=[\"LR\", \"LDA\"], metric=\"auc\", n_trials=6)\n\n\nTraining ========================= >>\nModels: LR, LDA\nMetric: auc\n\n\nRunning hyperparameter tuning for LogisticRegression...\n| trial | penalty |       C |  solver | max_iter | l1_ratio |     auc | best_auc | time_trial | time_ht |    state |\n| ----- | ------- | ------- | ------- | -------- | -------- | ------- | -------- | ---------- | ------- | -------- |\n| 0     |      l2 |  1.1302 |     sag |      730 |      0.3 |  0.5417 |   0.5417 |     0.093s |  0.093s | COMPLETE |\n| 1     |    None |  0.1544 |   lbfgs |      120 |      0.5 |  0.8542 |   0.8542 |     0.092s |  0.185s | COMPLETE |\n| 2     |      l2 |  0.0027 |     sag |      460 |      0.4 |  0.5625 |   0.8542 |     0.090s |  0.275s | COMPLETE |\n| 3     |      l2 |  0.0062 |   lbfgs |      800 |      0.8 |  0.6042 |   0.8542 |     0.090s |  0.365s | COMPLETE |\n| 4     | elast.. |  4.2724 |    saga |      530 |      0.1 |  0.6042 |   0.8542 |     0.096s |  0.461s | COMPLETE |\n| 5     |      l2 |  1.3274 | newto.. 
|      680 |      0.3 |  0.5625 |   0.8542 |     0.093s |  0.555s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 1\nBest parameters:\n --> penalty: None\n --> C: 0.1544\n --> solver: lbfgs\n --> max_iter: 120\n --> l1_ratio: 0.5\nBest evaluation --> auc: 0.8542\nTime elapsed: 0.555s\nFit ---------------------------------------------\nTrain evaluation --> auc: 1.0\nTest evaluation --> auc: 0.4133\nTime elapsed: 0.074s\n-------------------------------------------------\nTime: 0.629s\n\n\nRunning hyperparameter tuning for LinearDiscriminantAnalysis...\n| trial |  solver | shrinkage |     auc | best_auc | time_trial | time_ht |    state |\n| ----- | ------- | --------- | ------- | -------- | ---------- | ------- | -------- |\n| 0     |     svd |      None |  0.6458 |   0.6458 |     0.086s |  0.086s | COMPLETE |\n| 1     |    lsqr |       0.7 |  0.9375 |   0.9375 |     0.081s |  0.167s | COMPLETE |\n| 2     |     svd |       nan |  0.6458 |   0.9375 |     0.001s |  0.168s | COMPLETE |\n| 3     |    lsqr |       0.8 |   0.625 |   0.9375 |     0.079s |  0.247s | COMPLETE |\n| 4     |     svd |       nan |  0.6458 |   0.9375 |     0.000s |  0.247s | COMPLETE |\n| 5     |   eigen |       0.8 |    0.75 |   0.9375 |     0.078s |  0.326s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 1\nBest parameters:\n --> solver: lsqr\n --> shrinkage: 0.7\nBest evaluation --> auc: 0.9375\nTime elapsed: 0.326s\nFit ---------------------------------------------\nTrain evaluation --> auc: 0.8576\nTest evaluation --> auc: 0.8933\nTime elapsed: 0.016s\n-------------------------------------------------\nTime: 0.342s\n\n\nFinal results ==================== >>\nTotal time: 1.005s\n-------------------------------------\nLogisticRegression         --> auc: 0.4133 ~\nLinearDiscriminantAnalysis --> auc: 0.8933 !\n

And lastly, analyze the results.

>>> print(atom.evaluate())\n\n     accuracy      ap      ba      f1  jaccard     mcc  precision  recall     auc\nLR       0.60  0.2793  0.4000  0.0000      0.0 -0.2425       0.00     0.0  0.4667\nLDA      0.85  0.7944  0.7667  0.6667      0.5  0.5774       0.75     0.6  0.9067\n\n\n>>> atom.plot_lift()\n
"}, {"location": "license/", "title": "MIT License", "text": "

Copyright \u00a9 2023 Mavs

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \"Software\"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

"}, {"location": "API/ATOM/atomclassifier/", "title": "ATOMClassifier", "text": "

class atom.api.ATOMClassifier(*arrays, y=-1, index=False, shuffle=True, stratify=True, n_rows=1, test_size=0.2, holdout_size=None, n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Main class for classification tasks.

Apply all data transformations and model management provided by the package on a given dataset. Note that, contrary to sklearn's API, the instance contains the dataset on which to perform the analysis. Calling a method will automatically apply it on the dataset it contains.

All data cleaning, feature engineering, model training and plotting functionality can be accessed from an instance of this class.

Parameters*arrays: sequence of indexables Dataset containing features and target. Allowed formats are:

  • X
  • X, y
  • train, test
  • train, test, holdout
  • X_train, X_test, y_train, y_test
  • X_train, X_test, X_holdout, y_train, y_test, y_holdout
  • (X_train, y_train), (X_test, y_test)
  • (X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)

X, train, test: dataframe-like Feature set with shape=(n_samples, n_features).

y: int, str or sequence Target column corresponding to `X`.

  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe: Target columns for multioutput tasks.

y: int, str, dict, sequence or dataframe, default=-1 Target column corresponding to `X`.

  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe: Target columns for multioutput tasks.

This parameter is ignored if the target column is provided through arrays.

index: bool, int, str or sequence, default=False Handle the index in the resulting dataframe.

  • If False: Reset to RangeIndex.
  • If True: Use the provided index.
  • If int: Position of the column to use as index.
  • If str: Name of the column to use as index.
  • If sequence: Array with shape=(n_samples,) to use as index.

test_size: int or float, default=0.2

  • If <=1: Fraction of the dataset to include in the test set.
  • If >1: Number of rows to include in the test set.

This parameter is ignored if the test set is provided through arrays.

holdout_size: int, float or None, default=None

  • If None: No holdout data set is kept apart.
  • If <=1: Fraction of the dataset to include in the holdout set.
  • If >1: Number of rows to include in the holdout set.

This parameter is ignored if the holdout set is provided through arrays.

shuffle: bool, default=True Whether to shuffle the dataset before splitting the train and test set. Be aware that not shuffling the dataset can cause an unequal distribution of target classes over the sets.

stratify: bool, int, str or sequence, default=True Handle stratification of the target classes over the data sets.

  • If False: The data is split randomly.
  • If True: The data is stratified over the target column.
  • Else: Name or position of the columns to stratify by. The columns can't contain NaN values.

This parameter is ignored if shuffle=False or if the test set is provided through arrays.

For multioutput tasks, stratification is applied to the joint target columns.

n_rows: int or float, default=1 Random subsample of the dataset to use. The default value selects all rows.

  • If <=1: Fraction of the dataset to select.
  • If >1: Exact number of rows to select. Only if arrays is X or X, y.

n_jobs: int, default=1 Number of cores to use for parallel processing.

  • If >0: Number of cores to use.
  • If -1: Use all available cores.
  • If <-1: Use number of cores + 1 + n_jobs.

device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

  • \"data\":

    • \"numpy\"
    • \"pyarrow\"
    • \"modin\"
  • \"estimator\":

    • \"sklearn\"
    • \"sklearnex\"
    • \"cuml\"

backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

  • \"loky\": Single-node, process-based parallelism.
  • \"multiprocessing\": Legacy single-node, process-based parallelism. Less robust than loky.
  • \"threading\": Single-node, thread-based parallelism.
  • \"ray\": Multi-node, process-based parallelism.

memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide.

  • If False: No caching is performed.
  • If True: A default temp directory is used.
  • If str: Path to the caching directory.
  • If Path: A pathlib.Path to the caching directory.
  • If Memory: Object with the joblib.Memory interface.

verbose: int, default=0 Verbosity level of the class. Choose from:

  • 0 to not print anything.
  • 1 to print basic information.
  • 2 to print detailed information.

warnings: bool or str, default=False

  • If True: Default warning action (equal to \"once\").
  • If False: Suppress all warnings (equal to \"ignore\").
  • If str: One of python's warnings filters.

Changing this parameter affects the PYTHONWARNINGS environment variable. ATOM can't manage warnings that go from C/C++ code to stdout.

logger: str, Logger or None, default=None

  • If None: Logging isn't used.
  • If str: Name of the log file. Use \"auto\" for automatic name.
  • If Path: A pathlib.Path to the log file.
  • Else: Python logging.Logger instance.

experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed.

random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

See Also

ATOMForecaster Main class for forecasting tasks.

ATOMRegressor Main class for regression tasks.

"}, {"location": "API/ATOM/atomclassifier/#example", "title": "Example", "text": "
>>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> # Initialize atom\n>>> atom = ATOMClassifier(X, y, verbose=2)\n\n<< ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 138.97 kB\nScaled: False\nOutlier values: 180 (1.3%)\n\n\n\n>>> # Apply data cleaning and feature engineering methods\n>>> atom.balance(strategy=\"smote\")\n\nOversampling with SMOTE...\n --> Adding 116 samples to class 0.\n\n>>> atom.feature_selection(strategy=\"rfe\", solver=\"lr\", n_features=22)\n\nFitting FeatureSelector...\nPerforming feature selection ...\n --> rfe selected 22 features from the dataset.\n   --> Dropping feature mean area (rank 7).\n   --> Dropping feature mean compactness (rank 2).\n   --> Dropping feature mean fractal dimension (rank 6).\n   --> Dropping feature smoothness error (rank 9).\n   --> Dropping feature concave points error (rank 4).\n   --> Dropping feature fractal dimension error (rank 8).\n   --> Dropping feature worst radius (rank 3).\n   --> Dropping feature worst area (rank 5).\n\n\n>>> # Train models\n>>> atom.run(models=[\"LR\", \"RF\", \"XGB\"])\n\n\nTraining ========================= >>\nModels: LR, RF, XGB\nMetric: f1\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9878\nTest evaluation --> f1: 0.9859\nTime elapsed: 0.086s\n-------------------------------------------------\nTime: 0.086s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9714\nTime elapsed: 0.251s\n-------------------------------------------------\nTime: 0.251s\n\n\nResults for XGBoost:\nFit 
---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9718\nTime elapsed: 0.412s\n-------------------------------------------------\nTime: 0.412s\n\n\nFinal results ==================== >>\nTotal time: 0.759s\n-------------------------------------\nLogisticRegression --> f1: 0.9859 !\nRandomForest       --> f1: 0.9714\nXGBoost            --> f1: 0.9718\n\n\n>>> # Analyze the results\n>>> print(atom.results)\n\n     f1_train  f1_test  time_fit      time\nLR     0.9878   0.9859  0.086078  0.086078\nRF     1.0000   0.9714  0.251238  0.251238\nXGB    1.0000   0.9718  0.412373  0.412373\n\n\n>>> print(atom.evaluate())\n\n     accuracy      ap      ba      f1  jaccard     mcc  precision  recall     auc\nLR     0.9823  0.9975  0.9811  0.9859   0.9722  0.9621     0.9859  0.9859  0.9960\nRF     0.9646  0.9704  0.9670  0.9714   0.9444  0.9256     0.9855  0.9577  0.9670\nXGB    0.9646  0.9622  0.9621  0.9718   0.9452  0.9242     0.9718  0.9718  0.9621\n
"}, {"location": "API/ATOM/atomclassifier/#magic-methods", "title": "Magic methods", "text": "

The class contains some magic methods to help you access some of its elements faster. Note that methods that apply on the pipeline can return different results per branch.

  • __repr__: Prints an overview of atom's branches, models and metric.
  • __len__: Returns the length of the dataset.
  • __iter__: Iterate over the pipeline's transformers.
  • __contains__: Checks if the provided item is a column in the dataset.
  • __getitem__: Access a branch, model, column or subset of the dataset.

"}, {"location": "API/ATOM/atomclassifier/#attributes", "title": "Attributes", "text": ""}, {"location": "API/ATOM/atomclassifier/#data-attributes", "title": "Data attributes", "text": "

The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.

Attributespipeline: PipelinePipeline of transforms.

Tip

Use the plot_pipeline method to visualize the pipeline.

mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set.

This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). scaled: boolWhether the feature set is scaled.

A data set is considered scaled when it has mean=0 and std=1, or when there is a scaler in the pipeline. Binary columns (only zeros and ones) are excluded from the calculation. duplicates: int | numpy.integerNumber of duplicate rows in the dataset. missing: list[Any]Values that are considered \"missing\".

These values are used by the clean and impute methods. Default values are: None, NaN, NA, NaT, +inf, -inf, \"\", \"?\", \"NA\", \"nan\", \"NaN\", \"NaT\", \"none\", \"None\", \"inf\", \"-inf\". Note that None, NaN, NA, +inf and -inf are always considered missing since they are incompatible with sklearn estimators. nans: Series | modin.pandas.series.SeriesColumns with the number of missing values in them.

This property is unavailable for sparse datasets. n_nans: intNumber of rows containing missing values.

This property is unavailable for sparse datasets. numerical: IndexNames of the numerical features in the dataset. n_numerical: intNumber of numerical features in the dataset. categorical: IndexNames of the categorical features in the dataset. n_categorical: intNumber of categorical features in the dataset. outliers: SeriesColumns in training set with number of outlier values.

This property is unavailable for sparse datasets. n_outliers: int | numpy.integerNumber of samples in the training set containing outliers.

This property is unavailable for sparse datasets. classes: DataFrameDistribution of target classes per data set.

This property is only available for classification tasks. n_classes: int | numpy.integer | Series | modin.pandas.series.SeriesNumber of classes in the target column(s).

This property is only available for classification tasks.

"}, {"location": "API/ATOM/atomclassifier/#utility-attributes", "title": "Utility attributes", "text": "

The utility attributes are used to access information about the models in the instance after training.

Attributesbranch: BranchCurrent active branch.

Use the property's @setter to change the branch or to create a new one. If the value is the name of an existing branch, switch to that one. Else, create a new branch using that name. The new branch is split from the current branch. Use _from_ to split the new branch from any other existing branch. Read more in the user guide. models: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance.

Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. winner: model | NoneBest performing model.

Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. results: pd.DataFrameOverview of the training results.

All durations are in seconds. Possible values include:

  • [metric]_ht: Score obtained by the hyperparameter tuning.
  • time_ht: Duration of the hyperparameter tuning.
  • [metric]_train: Metric score on the train set.
  • [metric]_test: Metric score on the test set.
  • time_fit: Duration of the model fitting on the train set.
  • [metric]_bootstrap: Mean score on the bootstrapped samples.
  • time_bootstrap: Duration of the bootstrapping.
  • time: Total duration of the run.

"}, {"location": "API/ATOM/atomclassifier/#tracking-attributes", "title": "Tracking attributes", "text": "

The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.

Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline.

"}, {"location": "API/ATOM/atomclassifier/#plot-attributes", "title": "Plot attributes", "text": "

The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

Attributespalette: str | Sequence[str]Color palette.

Specify one of plotly's built-in palettes or create a custom one, e.g., atom.palette = [\"red\", \"green\", \"blue\"]. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers.

"}, {"location": "API/ATOM/atomclassifier/#utility-methods", "title": "Utility methods", "text": "

Next to the plotting methods, the class contains a variety of utility methods to handle the data and manage the pipeline.

addAdd a transformer to the pipeline.applyApply a function to the dataset.available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.distributionGet statistics on column distributions.edaCreate an Exploratory Data Analysis report.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_sample_weightReturn sample weights for a balanced data set.inverse_transformInversely transform new data through the pipeline.loadLoad an atom instance from a pickle file.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.resetReset the instance to its initial state.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_dataSave the data in the current branch to a .csv file.shrinkConvert the columns to the smallest possible matching dtype.stackingAdd a Stacking model to the pipeline.statsDisplay basic information about the dataset.statusGet an overview of the branches and models.transformTransform new data through the pipeline.votingAdd a Voting model to the pipeline.

method add(transformer, columns=None, train_only=False, **fit_params)[source]Add a transformer to the pipeline.

If the transformer is not fitted, it is fitted on the complete training set. Afterwards, the data set is transformed and the estimator is added to atom's pipeline. If the estimator is a sklearn Pipeline, every estimator is merged independently with atom.

Warning

  • The transformer should have fit and/or transform methods with arguments X (accepting a dataframe-like object of shape=(n_samples, n_features)) and/or y (accepting a sequence of shape=(n_samples,)).
  • The transform method should return a feature set as a dataframe-like object of shape=(n_samples, n_features) and/or a target column as a sequence of shape=(n_samples,).

Note

If the transform method doesn't return a dataframe:

  • The column naming happens as follows. If the transformer has a get_feature_names_out method, it is used. If not, and it returns the same number of columns, the names are kept equal. If the number of columns changes, old columns will keep their name (as long as the column is unchanged) and new columns will receive the name x[N-1], where N stands for the n-th feature. This means that a transformer should only transform, add or drop columns, not combinations of these.
  • The index remains the same as before the transformation. This means that the transformer should not add, remove or shuffle rows unless it returns a dataframe.

Note

If the transformer has a n_jobs and/or random_state parameter that is left to its default value, it adopts atom's value.

Parameterstransformer: Transformer Estimator to add to the pipeline. Should implement a transform method.

columns: int, str, segment, sequence or None, default=None Selection of columns to transform. Only select features or the target column, not both at the same time (if that happens, the target column is ignored). If None, transform all columns.

train_only: bool, default=False Whether to apply the estimator only on the training set or on the complete dataset. Note that if True, the transformation is skipped when making predictions on new data.

**fit_params Additional keyword arguments for the transformer's fit method.

method apply(func, inverse_func=None, kw_args=None, inv_kw_args=None, **kwargs)[source]Apply a function to the dataset.

This method is useful for stateless transformations such as taking the log, doing custom scaling, etc...

Note

This approach is preferred over changing the dataset directly through the property's @setter since the transformation is stored in the pipeline.

Tip

Use atom.apply(lambda df: df.drop(\"column_name\", axis=1)) to store the removal of columns in the pipeline.

Parametersfunc: callable Function to apply with signature func(dataset, **kw_args) -> dataset.

inverse_func: callable or None, default=None Inverse function of func. If None, the inverse_transform method returns the input unchanged.

kw_args: dict or None, default=None Additional keyword arguments for the function.

inv_kw_args: dict or None, default=None Additional keyword arguments for the inverse function.

method available_models()[source]Give an overview of the available predefined models.

Returnspd.DataFrame Information about the available predefined models. Columns include:

  • acronym: Model's acronym (used to call the model).
  • model: Name of the model's class.
  • estimator: The model's underlying estimator.
  • module: The estimator's module.
  • needs_scaling: Whether the model requires feature scaling.
  • accepts_sparse: Whether the model accepts sparse matrices.
  • native_multilabel: Whether the model has native support for multilabel tasks.
  • native_multioutput: Whether the model has native support for multioutput tasks.
  • has_validation: Whether the model has in-training validation.
  • supports_engines: Engines supported by the model.

method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

Parametersrows: int, default=1 Number of plots in length.

cols: int, default=2 Number of plots in width.

horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

title: str, dict or None, default=None Title for the plot.

  • If None, no title is shown.
  • If str, text for the title.
  • If dict, title configuration.

legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

  • If None: No legend is shown.
  • If str: Location where to show the legend.
  • If dict: Legend configuration.

figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

display: bool, default=True Whether to render the plot.

Yieldsgo.Figure Plot object.

method clear()[source]Reset attributes and clear cache from all models.

Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

  • In-training validation scores
  • Shap values
  • App instance
  • Dashboard instance
  • Calculated holdout data sets

method delete(models=None)[source]Delete models.

If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.

Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted.

method distribution(distributions=None, columns=None)[source]Get statistics on column distributions.

Compute the Kolmogorov-Smirnov test for various distributions against columns in the dataset. Only for numerical columns. Missing values are ignored.

Tip

Use the plot_distribution method to plot a column's distribution.

Parametersdistributions: str, sequence or None, default=None Names of the distributions in scipy.stats to get the statistics on. If None, a selection of the most common ones is used.

columns: int, str, segment, sequence or None, default=None Selection of columns to perform the test on. If None, select all numerical columns.

Returnspd.DataFrame Statistic results with multiindex levels:

  • dist: Name of the distribution.
  • stat: Statistic results:
    • score: KS-test score.
    • p_value: Corresponding p-value.

method eda(rows=\"dataset\", target=0, filename=None)[source]Create an Exploratory Data Analysis report.

ATOM uses the sweetviz package for EDA. The report is rendered directly in the notebook. It can also be accessed through the report attribute. It can either report one dataset or compare two datasets against each other.

Warning

This method can be slow for large datasets.

Parametersrows: str, sequence or dict, default=\"dataset\" Selection of rows to include in the report.

  • If str: Name of the data set to report.
  • If sequence: Names of two data sets to compare.
  • If dict: Names of up to two data sets with corresponding selection of rows to report.

target: int or str, default=0 Target column to look at. Only for multilabel tasks. Only bool and numerical features can be used as target.

filename: str, Path or None, default=None Filename or pathlib.Path of the (html) file to save. If None, don't save anything.

method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.

Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task.

rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

  • The task is binary or multilabel classification.
  • The model has a predict_proba method.
  • The metric evaluates predicted probabilities.

For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.

sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

Returnspd.DataFrame Scores of the models.

method export_pipeline(model=None)[source]Export the internal pipeline.

This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported.

ReturnsPipeline Current branch as a sklearn-like Pipeline object.

method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.

Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.

Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks.

method get_sample_weight(rows=\"train\")[source]Return sample weights for a balanced data set.

The returned weights are inversely proportional to the class frequencies in the selected data set. For multioutput tasks, the weights of each column of y will be multiplied.

Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

Returnsseries Sequence of weights with shape=(n_samples,).

method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be used to transform only the target column.

ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe: Target columns for multioutput tasks.

verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

Returnsdataframe Original feature set. Only returned if provided.

series or dataframe Original target column. Only returned if provided.

function atom.atom.load(filename, data=None)[source]Load an atom instance from a pickle file.

If the instance was saved using save_data=False, it's possible to load new data into it and apply all data transformations.

Info

The loaded instance's current branch is the same branch as it was when saved.

Parametersfilename: str or Path Filename or pathlib.Path of the pickle file.

data: tuple of indexables or None, default=None Original dataset as it was provided to the instance's constructor. Only use this parameter if the loaded file was saved using save_data=False. Allowed formats are:

  • X
  • X, y
  • train, test
  • train, test, holdout
  • X_train, X_test, y_train, y_test
  • X_train, X_test, X_holdout, y_train, y_test, y_holdout
  • (X_train, y_train), (X_test, y_test)
  • (X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)

X, train, test: dataframe-like Feature set with shape=(n_samples, n_features).

y: int, str, dict, sequence or dataframe Target column corresponding to `X`.

  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe: Target columns for multioutput tasks.

Returnsatom Unpickled atom instance.

method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.

Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.

Parametersother: Runner Instance with which to merge. Should be of the same class as self.

suffix: str, default=\"2\" Branches and models with conflicting names are merged adding suffix to the end of their names.

method update_layout(**kwargs)[source]Update the properties of the plot's layout.

Recursively update the structure of the original layout with the values in the arguments.

Parameters**kwargs Keyword arguments for the figure's update_layout method.

method update_traces(**kwargs)[source]Update the properties of the plot's traces.

Recursively update the structure of the original traces with the values in the arguments.

Parameters**kwargs Keyword arguments for the figure's update_traces method.

method reset(hard=False)[source]Reset the instance to its initial state.

Deletes all branches and models. The dataset is also reset to its form after initialization.

Parametershard: bool, default=False If True, completely flushes the cache.

method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance.

method save_data(filename=\"auto\", rows=\"dataset\", **kwargs)[source]Save the data in the current branch to a .csv file.

Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

rows: hashable, segment, sequence or dataframe, default=\"dataset\" Selection of rows to save.

**kwargs Additional keyword arguments for pandas' to_csv method.

method shrink(int2bool=False, int2uint=False, str2cat=False, dense2sparse=False, columns=None)[source]Convert the columns to the smallest possible matching dtype.

Examples are: float64 -> float32, int64 -> int8, etc... Sparse arrays also transform their non-fill value. Use this method for memory optimization before saving the dataset. Note that applying transformers to the data may alter the types again.

Parametersint2bool: bool, default=False Whether to convert int columns to bool type. Only if the values in the column are strictly in (0, 1) or (-1, 1).

int2uint: bool, default=False Whether to convert int to uint (unsigned integer). Only if the values in the column are strictly positive.

str2cat: bool, default=False Whether to convert string to category. Only if the number of categories is less than 30% of the column's length.

dense2sparse: bool, default=False Whether to convert all features to sparse format. The value that is compressed is the most frequent value in the column.

columns: int, str, segment, sequence or None, default=None Selection of columns to shrink. If None, transform all columns.

method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.

Warning

Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: Stack.

**kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the final_estimator parameter.

method stats()[source]Display basic information about the dataset.

method status()[source]Get an overview of the branches and models.

This method prints the same information as the __repr__ and also saves it to the logger.

method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe: Target columns for multioutput tasks.

verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

Returnsdataframe Transformed feature set. Only returned if provided.

series or dataframe Transformed target column. Only returned if provided.

method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.

Warning

Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

Parametersmodels: segment, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch.

name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: Vote.

**kwargs Additional keyword arguments for sklearn's voting instance.

"}, {"location": "API/ATOM/atomclassifier/#data-cleaning", "title": "Data cleaning", "text": "

The data cleaning methods can help you scale the data, handle missing values, categorical columns, outliers and unbalanced datasets. All attributes of the data cleaning classes are attached to atom after running. Read more in the user guide.

Tip

Use the eda method to examine the data and help you determine suitable parameters for the data cleaning methods.

balanceBalance the number of rows per class in the target column.cleanApply standard data cleaning steps on the dataset.discretizeBin continuous data into intervals.encodePerform encoding of categorical features.imputeHandle missing values in the dataset.normalizeTransform the data to follow a Normal/Gaussian distribution.prunePrune outliers from the training set.scaleScale the data.

method balance(strategy=\"adasyn\", **kwargs)[source]Balance the number of rows per class in the target column.

When oversampling, the newly created samples have an increasing integer index for numerical indices, and an index of the form [estimator]_N for non-numerical indices, where N stands for the N-th sample in the data set.

See the Balancer class for a description of the parameters.

Warning

  • The balance method does not support multioutput tasks.
  • This transformation is only applied to the training set in order to maintain the original distribution of target classes in the test set.

Tip

Use atom's classes attribute for an overview of the target class distribution per data set.

method clean(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, **kwargs)[source]Apply standard data cleaning steps on the dataset.

Use the parameters to choose which transformations to perform. The available steps are:

  • Convert dtypes to the best possible types.
  • Drop columns with specific data types.
  • Remove characters from column names.
  • Strip categorical features from spaces.
  • Drop duplicate rows.
  • Drop rows with missing values in the target column.
  • Encode the target column (ignored for regression tasks).

See the Cleaner class for a description of the parameters.

method discretize(strategy=\"quantile\", bins=5, labels=None, **kwargs)[source]Bin continuous data into intervals.

For each feature, the bin edges are computed during fit and, together with the number of bins, they will define the intervals. Ignores categorical columns.

See the Discretizer class for a description of the parameters.

Tip

Use the plot_distribution method to visualize a column's distribution and decide on the bins.

method encode(strategy=\"Target\", max_onehot=10, ordinal=None, infrequent_to_value=None, value=\"rare\", **kwargs)[source]Perform encoding of categorical features.

The encoding type depends on the number of classes in the column:

  • If n_classes=2 or ordinal feature, use Ordinal-encoding.
  • If 2 < n_classes <= max_onehot, use OneHot-encoding.
  • If n_classes > max_onehot, use strategy-encoding.

Missing values are propagated to the output column. Unknown classes encountered during transforming are imputed according to the selected strategy. Rare classes can be replaced with a value in order to prevent too high cardinality.

See the Encoder class for a description of the parameters.

Note

This method only encodes the categorical features. It does not encode the target column! Use the clean method for that.

Tip

Use the categorical attribute for a list of the categorical features in the dataset.

method impute(strat_num=\"drop\", strat_cat=\"drop\", max_nan_rows=None, max_nan_cols=None, **kwargs)[source]Handle missing values in the dataset.

Impute or remove missing values according to the selected strategy. Also removes rows and columns with too many missing values. Use the missing attribute to customize what are considered \"missing values\".

See the Imputer class for a description of the parameters.

Tip

Use the nans attribute to check the amount of missing values per column.

method normalize(strategy=\"yeojohnson\", **kwargs)[source]Transform the data to follow a Normal/Gaussian distribution.

This transformation is useful for modeling issues related to heteroscedasticity (non-constant variance), or other situations where normality is desired. Missing values are disregarded in fit and maintained in transform. Ignores categorical columns.

See the Normalizer class for a description of the parameters.

Tip

Use the plot_distribution method to examine a column's distribution.

method prune(strategy=\"zscore\", method=\"drop\", max_sigma=3, include_target=False, **kwargs)[source]Prune outliers from the training set.

Replace or remove outliers. The definition of an outlier depends on the selected strategy and can differ greatly between strategies. Ignores categorical columns.

See the Pruner class for a description of the parameters.

Note

This transformation is only applied to the training set in order to maintain the original distribution of samples in the test set.

Tip

Use the outliers attribute to check the number of outliers per column.

method scale(strategy=\"standard\", include_binary=False, **kwargs)[source]Scale the data.

Apply one of sklearn's scalers. Categorical columns are ignored.

See the Scaler class for a description of the parameters.

Tip

Use the scaled attribute to check whether the dataset is scaled.

"}, {"location": "API/ATOM/atomclassifier/#nlp", "title": "NLP", "text": "

The Natural Language Processing (NLP) transformers help to convert raw text to meaningful numeric values, ready to be ingested by a model. All transformations are applied only on the column in the dataset called corpus. Read more in the user guide.

textcleanApply standard text cleaning to the corpus.textnormalizeNormalize the corpus.tokenizeTokenize the corpus.vectorizeVectorize the corpus.

method textclean(decode=True, lower_case=True, drop_email=True, regex_email=None, drop_url=True, regex_url=None, drop_html=True, regex_html=None, drop_emoji=True, regex_emoji=None, drop_number=True, regex_number=None, drop_punctuation=True, **kwargs)[source]Apply standard text cleaning to the corpus.

Transformations include normalizing characters and dropping noise from the text (emails, HTML tags, URLs, etc...). The transformations are applied on the column named corpus, in the same order the parameters are presented. If there is no column with that name, an exception is raised.

See the TextCleaner class for a description of the parameters.

method textnormalize(stopwords=True, custom_stopwords=None, stem=False, lemmatize=True, **kwargs)[source]Normalize the corpus.

Convert words to a more uniform standard. The transformations are applied on the column named corpus, in the same order the parameters are presented. If there is no column with that name, an exception is raised. If the provided documents are strings, words are separated by spaces.

See the TextNormalizer class for a description of the parameters.

method tokenize(bigram_freq=None, trigram_freq=None, quadgram_freq=None, **kwargs)[source]Tokenize the corpus.

Convert documents into sequences of words. Additionally, create n-grams (represented by words united with underscores, e.g., \"New_York\") based on their frequency in the corpus. The transformations are applied on the column named corpus. If there is no column with that name, an exception is raised.

See the Tokenizer class for a description of the parameters.

method vectorize(strategy=\"bow\", return_sparse=True, **kwargs)[source]Vectorize the corpus.

Transform the corpus into meaningful vectors of numbers. The transformation is applied on the column named corpus. If there is no column with that name, an exception is raised.

If strategy=\"bow\" or \"tfidf\", the transformed columns are named after the word they are embedding with the prefix corpus_. If strategy=\"hashing\", the columns are named hash[N], where N stands for the n-th hashed column.

See the Vectorizer class for a description of the parameters.

"}, {"location": "API/ATOM/atomclassifier/#feature-engineering", "title": "Feature engineering", "text": "

To further pre-process the data, it's possible to extract features from datetime columns, create new non-linear features transforming the existing ones, group similar features or, if the dataset is too large, remove features. Read more in the user guide.

feature_extractionExtract features from datetime columns.feature_generationGenerate new features.feature_groupingExtract statistics from similar features.feature_selectionReduce the number of features in the data.

method feature_extraction(features=('day', 'month', 'year'), fmt=None, encoding_type=\"ordinal\", drop_columns=True, **kwargs)[source]Extract features from datetime columns.

Create new features extracting datetime elements (day, month, year, etc...) from the provided columns. Columns of dtype datetime64 are used as is. Categorical columns that can be successfully converted to a datetime format (less than 30% NaT values after conversion) are also used.

See the FeatureExtractor class for a description of the parameters.

method feature_generation(strategy=\"dfs\", n_features=None, operators=None, **kwargs)[source]Generate new features.

Create new combinations of existing features to capture the non-linear relations between the original features.

See the FeatureGenerator class for a description of the parameters.

method feature_grouping(groups, operators=None, drop_columns=True, **kwargs)[source]Extract statistics from similar features.

Replace groups of features with related characteristics with new features that summarize statistical properties of the group. The statistical operators are calculated over every row of the group. The group names and features can be accessed through the groups method.

See the FeatureGrouper class for a description of the parameters.

Tip

Use a regex pattern with the groups parameter to select groups easier, e.g., atom.feature_grouping({\"group1\": \"var_.+\"}) to select all features that start with var_.

method feature_selection(strategy=None, solver=None, n_features=None, min_repeated=2, max_repeated=1.0, max_correlation=1.0, **kwargs)[source]Reduce the number of features in the data.

Apply feature selection or dimensionality reduction, either to improve the estimators' accuracy or to boost their performance on very high-dimensional datasets. Additionally, remove multicollinear and low-variance features.

See the FeatureSelector class for a description of the parameters.

Note

  • When strategy=\"univariate\" and solver=None, f_classif or f_regression is used as default solver.
  • When strategy is \"sfs\", \"rfecv\" or any of the advanced strategies and no scoring is specified, atom's metric (if it exists) is used as scoring.

"}, {"location": "API/ATOM/atomclassifier/#training", "title": "Training", "text": "

The training methods are where the models are fitted to the data and their performance is evaluated against a selected metric. There are three methods to call the three different training approaches. Read more in the user guide.

runTrain and evaluate the models in a direct fashion.successive_halvingFit the models in a successive halving fashion.train_sizingTrain and evaluate the models in a train sizing fashion.

method run(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a direct fashion.

Contrary to successive_halving and train_sizing, the direct approach only iterates once over the models, using the full dataset.

The following steps are applied to every model:

  1. Apply hyperparameter tuning (optional).
  2. Fit the model on the training set using the best combination of hyperparameters found.
  3. Evaluate the model on the test set.
  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

See the DirectClassifier or DirectRegressor class for a description of the parameters.

method successive_halving(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Fit the models in a successive halving fashion.

The successive halving technique is a bandit-based algorithm that fits N models to 1/N of the data. The best half are selected to go to the next iteration where the process is repeated. This continues until only one model remains, which is fitted on the complete dataset. Beware that a model's performance can depend greatly on the amount of data on which it is trained. For this reason, it is recommended to only use this technique with similar models, e.g., only using tree-based models.

The following steps are applied to every model (per iteration):

  1. Apply hyperparameter tuning (optional).
  2. Fit the model on the training set using the best combination of hyperparameters found.
  3. Evaluate the model on the test set.
  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

See the SuccessiveHalvingClassifier or SuccessiveHalvingRegressor class for a description of the parameters.

method train_sizing(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a train sizing fashion.

When training models, there is usually a trade-off between model performance and computation time, which is regulated by the number of samples in the training set. This method can be used to gain insight into this trade-off and help determine the optimal size of the training set. The models are fitted multiple times, each time increasing the number of samples in the training set.

The following steps are applied to every model (per iteration):

  1. Apply hyperparameter tuning (optional).
  2. Fit the model on the training set using the best combination of hyperparameters found.
  3. Evaluate the model on the test set.
  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

See the TrainSizingClassifier or TrainSizingRegressor class for a description of the parameters.

"}, {"location": "API/ATOM/atomforecaster/", "title": "ATOMForecaster", "text": "

class atom.api.ATOMForecaster(*arrays, y=-1, n_rows=1, test_size=0.2, holdout_size=None, n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Main class for forecasting tasks.

Apply all data transformations and model management provided by the package on a given dataset. Note that, contrary to sklearn's API, the instance contains the dataset on which to perform the analysis. Calling a method will automatically apply it on the dataset it contains.

All data cleaning, feature engineering, model training and plotting functionality can be accessed from an instance of this class.

Parameters*arrays: sequence of indexables Dataset containing exogenous features and time series. Allowed formats are:

  • X
  • y
  • X, y
  • train, test
  • train, test, holdout
  • X_train, X_test, y_train, y_test
  • X_train, X_test, X_holdout, y_train, y_test, y_holdout
  • (X_train, y_train), (X_test, y_test)
  • (X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)

X, train, test: dataframe-like Exogenous feature set corresponding to y, with shape=(n_samples, n_features).

y: int, str or sequence Time series.

  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe: Target columns for multioutput tasks.

y: int, str, dict, sequence or dataframe, default=-1 Time series.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe: Target columns for multioutput tasks.

This parameter is ignored if the time series is provided through arrays.

test_size: int or float, default=0.2

  • If <=1: Fraction of the dataset to include in the test set.
  • If >1: Number of rows to include in the test set.

This parameter is ignored if the test set is provided through arrays.

holdout_size: int, float or None, default=None

  • If None: No holdout data set is kept apart.
  • If <=1: Fraction of the dataset to include in the holdout set.
  • If >1: Number of rows to include in the holdout set.

This parameter is ignored if the holdout set is provided through arrays.

n_rows: int or float, default=1 Subsample of the dataset to use. The cut is made from the head of the dataset (older entries are dropped when sorted by date ascending). The default value selects all rows.

  • If <=1: Fraction of the dataset to select.
  • If >1: Exact number of rows to select. Only if arrays is X or X, y.

n_jobs: int, default=1 Number of cores to use for parallel processing.

  • If >0: Number of cores to use.
  • If -1: Use all available cores.
  • If <-1: Use number of cores + 1 + n_jobs (e.g., n_jobs=-2 uses all cores but one).

device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

  • \"data\":

    • \"numpy\"
    • \"pyarrow\"
    • \"modin\"
  • \"estimator\":

    • \"sklearn\"
    • \"sklearnex\"
    • \"cuml\"

backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

  • \"loky\": Single-node, process-based parallelism.
  • \"multiprocessing\": Legacy single-node, process-based parallelism. Less robust than loky.
  • \"threading\": Single-node, thread-based parallelism.
  • \"ray\": Multi-node, process-based parallelism.

memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide.

  • If False: No caching is performed.
  • If True: A default temp directory is used.
  • If str: Path to the caching directory.
  • If Path: A pathlib.Path to the caching directory.
  • If Memory: Object with the joblib.Memory interface.

verbose: int, default=0 Verbosity level of the class. Choose from:

  • 0 to not print anything.
  • 1 to print basic information.
  • 2 to print detailed information.

warnings: bool or str, default=False

  • If True: Default warning action (equal to \"once\").
  • If False: Suppress all warnings (equal to \"ignore\").
  • If str: One of python's warnings filters.

Changing this parameter affects the PYTHONWARNINGS environment variable. ATOM can't manage warnings that go from C/C++ code to stdout.

logger: str, Path, Logger or None, default=None

  • If None: Logging isn't used.
  • If str: Name of the log file. Use \"auto\" for automatic name.
  • If Path: A pathlib.Path to the log file.
  • Else: Python logging.Logger instance.

experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed.

random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

See Also

ATOMClassifier Main class for classification tasks.

ATOMRegressor Main class for regression tasks.

"}, {"location": "API/ATOM/atomforecaster/#example", "title": "Example", "text": "
>>> from atom import ATOMForecaster\n>>> from sktime.datasets import load_airline\n\n>>> y = load_airline()\n\n>>> # Initialize atom\n>>> atom = ATOMForecaster(y, verbose=2)\n\n<< ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Univariate forecast.\n\nDataset stats ==================== >>\nShape: (144, 1)\nTrain set size: 116\n --> From: 1949-01  To: 1958-08\nTest set size: 28\n --> From: 1958-09  To: 1960-12\n-------------------------------------\nMemory: 6.47 kB\nDuplicates: 26 (18.1%)\n\n\n\n>>> # Train models\n>>> atom.run(models=[\"NF\", \"ES\", \"ETS\"])\n\n\nTraining ========================= >>\nModels: NF, ES, ETS\nMetric: mape\n\n\nResults for NaiveForecaster:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0858\nTest evaluation --> mape: -0.2305\nTime elapsed: 0.025s\n-------------------------------------------------\nTime: 0.025s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0864\nTest evaluation --> mape: -0.2303\nTime elapsed: 0.042s\n-------------------------------------------------\nTime: 0.042s\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0858\nTest evaluation --> mape: -0.2305\nTime elapsed: 0.021s\n-------------------------------------------------\nTime: 0.021s\n\n\nFinal results ==================== >>\nTotal time: 0.090s\n-------------------------------------\nNaiveForecaster      --> mape: -0.2305\nExponentialSmoothing --> mape: -0.2303 !\nETS                  --> mape: -0.2305\n\n\n>>> # Analyze the results\n>>> print(atom.results)\n\n     mape_train  mape_test  time_fit      time\nNF      -0.0858    -0.2305  0.025023  0.025023\nES      -0.0864    -0.2303  0.042052  0.042052\nETS     -0.0858    -0.2305  0.021019  0.021019\n\n\n>>> print(atom.evaluate())\n\n         mae    mape         mse      r2      rmse\nNF  -91.8571 
-0.2305 -10656.7143 -0.7278 -103.2314\nES  -91.8163 -0.2303 -10647.1506 -0.7263 -103.1850\nETS -91.8563 -0.2305 -10656.5266 -0.7278 -103.2305\n
"}, {"location": "API/ATOM/atomforecaster/#magic-methods", "title": "Magic methods", "text": "

The class contains some magic methods to help you access some of its elements faster. Note that methods that apply on the pipeline can return different results per branch.

  • __repr__: Prints an overview of atom's branches, models and metric.
  • __len__: Returns the length of the dataset.
  • __iter__: Iterate over the pipeline's transformers.
  • __contains__: Checks if the provided item is a column in the dataset.
  • __getitem__: Access a branch, model, column or subset of the dataset.

"}, {"location": "API/ATOM/atomforecaster/#attributes", "title": "Attributes", "text": ""}, {"location": "API/ATOM/atomforecaster/#data-attributes", "title": "Data attributes", "text": "

The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.

Attributespipeline: PipelinePipeline of transforms.

Tip

Use the plot_pipeline method to visualize the pipeline.

mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set.

This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). scaled: boolWhether the feature set is scaled.

A data set is considered scaled when it has mean=0 and std=1, or when there is a scaler in the pipeline. Binary columns (only zeros and ones) are excluded from the calculation. duplicates: int | numpy.integerNumber of duplicate rows in the dataset. missing: list[Any]Values that are considered \"missing\".

These values are used by the clean and impute methods. Default values are: None, NaN, NA, NaT, +inf, -inf, \"\", \"?\", \"NA\", \"nan\", \"NaN\", \"NaT\", \"none\", \"None\", \"inf\", \"-inf\". Note that None, NaN, NA, +inf and -inf are always considered missing since they are incompatible with sklearn estimators. nans: Series | modin.pandas.series.SeriesColumns with the number of missing values in them.

This property is unavailable for sparse datasets. n_nans: intNumber of rows containing missing values.

This property is unavailable for sparse datasets. numerical: IndexNames of the numerical features in the dataset. n_numerical: intNumber of numerical features in the dataset. categorical: IndexNames of the categorical features in the dataset. n_categorical: intNumber of categorical features in the dataset. outliers: SeriesColumns in training set with number of outlier values.

This property is unavailable for sparse datasets. n_outliers: int | numpy.integerNumber of samples in the training set containing outliers.

This property is unavailable for sparse datasets.

"}, {"location": "API/ATOM/atomforecaster/#utility-attributes", "title": "Utility attributes", "text": "

The utility attributes are used to access information about the models in the instance after training.

Attributesbranch: BranchCurrent active branch.

Use the property's @setter to change the branch or to create a new one. If the value is the name of an existing branch, switch to that one. Else, create a new branch using that name. The new branch is split from the current branch. Use _from_ to split the new branch from any other existing branch. Read more in the user guide. models: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance.

Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. winner: model | NoneBest performing model.

Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. results: pd.DataFrameOverview of the training results.

All durations are in seconds. Possible values include:

  • [metric]_ht: Score obtained by the hyperparameter tuning.
  • time_ht: Duration of the hyperparameter tuning.
  • [metric]_train: Metric score on the train set.
  • [metric]_test: Metric score on the test set.
  • time_fit: Duration of the model fitting on the train set.
  • [metric]_bootstrap: Mean score on the bootstrapped samples.
  • time_bootstrap: Duration of the bootstrapping.
  • time: Total duration of the run.

"}, {"location": "API/ATOM/atomforecaster/#tracking-attributes", "title": "Tracking attributes", "text": "

The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.

Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline.

"}, {"location": "API/ATOM/atomforecaster/#plot-attributes", "title": "Plot attributes", "text": "

The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

Attributespalette: str | Sequence[str]Color palette.

Specify one of plotly's built-in palettes or create a custom one, e.g., atom.palette = [\"red\", \"green\", \"blue\"]. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers.

"}, {"location": "API/ATOM/atomforecaster/#utility-methods", "title": "Utility methods", "text": "

Next to the plotting methods, the class contains a variety of utility methods to handle the data and manage the pipeline.

addAdd a transformer to the pipeline.applyApply a function to the dataset.available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.distributionGet statistics on column distributions.edaCreate an Exploratory Data Analysis report.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_sample_weightReturn sample weights for a balanced data set.inverse_transformInversely transform new data through the pipeline.loadLoad an atom instance from a pickle file.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.resetReset the instance to its initial state.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_dataSave the data in the current branch to a .csv file.shrinkConvert the columns to the smallest possible matching dtype.stackingAdd a Stacking model to the pipeline.statsDisplay basic information about the dataset.statusGet an overview of the branches and models.transformTransform new data through the pipeline.votingAdd a Voting model to the pipeline.

method add(transformer, columns=None, train_only=False, **fit_params)[source]Add a transformer to the pipeline.

If the transformer is not fitted, it is fitted on the complete training set. Afterwards, the data set is transformed and the estimator is added to atom's pipeline. If the estimator is a sklearn Pipeline, every estimator is merged independently with atom.

Warning

  • The transformer should have fit and/or transform methods with arguments X (accepting a dataframe-like object of shape=(n_samples, n_features)) and/or y (accepting a sequence of shape=(n_samples,)).
  • The transform method should return a feature set as a dataframe-like object of shape=(n_samples, n_features) and/or a target column as a sequence of shape=(n_samples,).

Note

If the transform method doesn't return a dataframe:

  • The column naming happens as follows. If the transformer has a get_feature_names_out method, it is used. If not, and it returns the same number of columns, the names are kept equal. If the number of columns changes, old columns will keep their name (as long as the column is unchanged) and new columns will receive the name x[N-1], where N stands for the n-th feature. This means that a transformer should only transform, add or drop columns, not combinations of these.
  • The index remains the same as before the transformation. This means that the transformer should not add, remove or shuffle rows unless it returns a dataframe.

Note

If the transformer has a n_jobs and/or random_state parameter that is left to its default value, it adopts atom's value.

Parameterstransformer: Transformer Estimator to add to the pipeline. Should implement a transform method.

columns: int, str, segment, sequence or None, default=None Selection of columns to transform. Only select features or the target column, not both at the same time (if that happens, the target column is ignored). If None, transform all columns.

train_only: bool, default=False Whether to apply the estimator only on the training set or on the complete dataset. Note that if True, the transformation is skipped when making predictions on new data.

**fit_params Additional keyword arguments for the transformer's fit method.

method apply(func, inverse_func=None, kw_args=None, inv_kw_args=None, **kwargs)[source]Apply a function to the dataset.

This method is useful for stateless transformations such as taking the log, doing custom scaling, etc...

Note

This approach is preferred over changing the dataset directly through the property's @setter since the transformation is stored in the pipeline.

Tip

Use atom.apply(lambda df: df.drop(\"column_name\",axis=1)) to store the removal of columns in the pipeline.

Parametersfunc: callable Function to apply with signature func(dataset, **kw_args) -> dataset.

inverse_func: callable or None, default=None Inverse function of func. If None, the inverse_transform method returns the input unchanged.

kw_args: dict or None, default=None Additional keyword arguments for the function.

inv_kw_args: dict or None, default=None Additional keyword arguments for the inverse function.

method available_models()[source]Give an overview of the available predefined models.

Returnspd.DataFrame Information about the available predefined models. Columns include:

  • acronym: Model's acronym (used to call the model).
  • model: Name of the model's class.
  • estimator: The model's underlying estimator.
  • module: The estimator's module.
  • needs_scaling: Whether the model requires feature scaling.
  • accepts_sparse: Whether the model accepts sparse matrices.
  • native_multilabel: Whether the model has native support for multilabel tasks.
  • native_multioutput: Whether the model has native support for multioutput tasks.
  • has_validation: Whether the model has in-training validation.
  • supports_engines: Engines supported by the model.

method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

Parametersrows: int, default=1 Number of plots in length.

cols: int, default=2 Number of plots in width.

horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

title: str, dict or None, default=None Title for the plot.

  • If None, no title is shown.
  • If str, text for the title.
  • If dict, title configuration.

legend: str, dict or None, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

  • If None: No legend is shown.
  • If str: Location where to show the legend.
  • If dict: Legend configuration.

figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

display: bool, default=True Whether to render the plot.

Yieldsgo.Figure Plot object.

method clear()[source]Reset attributes and clear cache from all models.

Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

  • In-training validation scores
  • Shap values
  • App instance
  • Dashboard instance
  • Calculated holdout data sets

method delete(models=None)[source]Delete models.

If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.

Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted.

method distribution(distributions=None, columns=None)[source]Get statistics on column distributions.

Compute the Kolmogorov-Smirnov test for various distributions against columns in the dataset. Only for numerical columns. Missing values are ignored.

Tip

Use the plot_distribution method to plot a column's distribution.

Parametersdistributions: str, sequence or None, default=None Names of the distributions in scipy.stats to get the statistics on. If None, a selection of the most common ones is used.

columns: int, str, segment, sequence or None, default=None Selection of columns to perform the test on. If None, select all numerical columns.

Returnspd.DataFrame Statistic results with multiindex levels:

  • dist: Name of the distribution.
  • stat: Statistic results:
    • score: KS-test score.
    • p_value: Corresponding p-value.

method eda(rows=\"dataset\", target=0, filename=None)[source]Create an Exploratory Data Analysis report.

ATOM uses the sweetviz package for EDA. The report is rendered directly in the notebook. It can also be accessed through the report attribute. It can either report one dataset or compare two datasets against each other.

Warning

This method can be slow for large datasets.

Parametersrows: str, sequence or dict, default=\"dataset\" Selection of rows to include in the report.

  • If str: Name of the data set to report.
  • If sequence: Names of two data sets to compare.
  • If dict: Names of up to two data sets with corresponding selection of rows to report.

target: int or str, default=0 Target column to look at. Only for multilabel tasks. Only bool and numerical features can be used as target.

filename: str, Path or None, default=None Filename or pathlib.Path of the (html) file to save. If None, don't save anything.

method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.

Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task.

rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

  • The task is binary or multilabel classification.
  • The model has a predict_proba method.
  • The metric evaluates predicted probabilities.

For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.

sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

Returnspd.DataFrame Scores of the models.

method export_pipeline(model=None)[source]Export the internal pipeline.

This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported.

ReturnsPipeline Current branch as a sklearn-like Pipeline object.

method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.

Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.

Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks.

method get_sample_weight(rows=\"train\")[source]Return sample weights for a balanced data set.

The returned weights are inversely proportional to the class frequencies in the selected data set. For multioutput tasks, the weights of each column of y will be multiplied.

Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

Returnsseries Sequence of weights with shape=(n_samples,).

method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be used to transform only the target column.

ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe: Target columns for multioutput tasks.

verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

Returnsdataframe Original feature set. Only returned if provided.

series or dataframe Original target column. Only returned if provided.

function atom.atom.load(filename, data=None)[source]Load an atom instance from a pickle file.

If the instance was saved using save_data=False, it's possible to load new data into it and apply all data transformations.

Info

The loaded instance's current branch is the same branch as it was when saved.

Parametersfilename: str or Path Filename or pathlib.Path of the pickle file.

data: tuple of indexables or None, default=None Original dataset as it was provided to the instance's constructor. Only use this parameter if the loaded file was saved using save_data=False. Allowed formats are:

  • X
  • X, y
  • train, test
  • train, test, holdout
  • X_train, X_test, y_train, y_test
  • X_train, X_test, X_holdout, y_train, y_test, y_holdout
  • (X_train, y_train), (X_test, y_test)
  • (X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)

X, train, test: dataframe-like Feature set with shape=(n_samples, n_features).

y: int, str or sequence Target column corresponding to `X`.

  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe: Target columns for multioutput tasks.

Returnsatom Unpickled atom instance.

method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.

Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.

Parametersother: Runner Instance with which to merge. Should be of the same class as self.

suffix: str, default=\"2\" Branches and models with conflicting names are merged adding suffix to the end of their names.

method update_layout(**kwargs)[source]Update the properties of the plot's layout.

Recursively update the structure of the original layout with the values in the arguments.

Parameters**kwargs Keyword arguments for the figure's update_layout method.

method update_traces(**kwargs)[source]Update the properties of the plot's traces.

Recursively update the structure of the original traces with the values in the arguments.

Parameters**kwargs Keyword arguments for the figure's update_traces method.

method reset(hard=False)[source]Reset the instance to its initial state.

Deletes all branches and models. The dataset is also reset to its form after initialization.

Parametershard: bool, default=False If True, completely flushes the cache.

method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance.

method save_data(filename=\"auto\", rows=\"dataset\", **kwargs)[source]Save the data in the current branch to a .csv file.

Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

rows: hashable, segment, sequence or dataframe, default=\"dataset\" Selection of rows to save.

**kwargs Additional keyword arguments for pandas' to_csv method.

method shrink(int2bool=False, int2uint=False, str2cat=False, dense2sparse=False, columns=None)[source]Convert the columns to the smallest possible matching dtype.

Examples are: float64 -> float32, int64 -> int8, etc... Sparse arrays also transform their non-fill value. Use this method for memory optimization before saving the dataset. Note that applying transformers to the data may alter the types again.

Parametersint2bool: bool, default=False Whether to convert int columns to bool type. Only if the values in the column are strictly in (0, 1) or (-1, 1).

int2uint: bool, default=False Whether to convert int to uint (unsigned integer). Only if the values in the column are strictly positive.

str2cat: bool, default=False Whether to convert string to category. Only if the number of categories is less than 30% of the column's length.

dense2sparse: bool, default=False Whether to convert all features to sparse format. The value that is compressed is the most frequent value in the column.

columns: int, str, segment, sequence or None, default=None Selection of columns to shrink. If None, transform all columns.

method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.

Warning

Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: Stack.

**kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the final_estimator parameter.

method stats()[source]Display basic information about the dataset.

method status()[source]Get an overview of the branches and models.

This method prints the same information as the __repr__ and also saves it to the logger.

method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe: Target columns for multioutput tasks.

verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

Returnsdataframe Transformed feature set. Only returned if provided.

series or dataframe Transformed target column. Only returned if provided.

method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.

Warning

Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

Parametersmodels: segment, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch.

name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: Vote.

**kwargs Additional keyword arguments for sklearn's voting instance.

"}, {"location": "API/ATOM/atomforecaster/#data-cleaning", "title": "Data cleaning", "text": "

The data cleaning methods can help you scale the data, handle missing values, categorical columns and outliers. All attributes of the data cleaning classes are attached to atom after running. Read more in the user guide.

Tip

Use the eda method to examine the data and help you determine suitable parameters for the data cleaning methods.

cleanApply standard data cleaning steps on the dataset.discretizeBin continuous data into intervals.encodePerform encoding of categorical features.imputeHandle missing values in the dataset.normalizeTransform the data to follow a Normal/Gaussian distribution.prunePrune outliers from the training set.scaleScale the data.

method clean(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, **kwargs)[source]Apply standard data cleaning steps on the dataset.

Use the parameters to choose which transformations to perform. The available steps are:

  • Convert dtypes to the best possible types.
  • Drop columns with specific data types.
  • Remove characters from column names.
  • Strip categorical features from spaces.
  • Drop duplicate rows.
  • Drop rows with missing values in the target column.
  • Encode the target column (ignored for regression tasks).

See the Cleaner class for a description of the parameters.

method discretize(strategy=\"quantile\", bins=5, labels=None, **kwargs)[source]Bin continuous data into intervals.

For each feature, the bin edges are computed during fit and, together with the number of bins, they will define the intervals. Ignores categorical columns.

See the Discretizer class for a description of the parameters.

Tip

Use the plot_distribution method to visualize a column's distribution and decide on the bins.

method encode(strategy=\"Target\", max_onehot=10, ordinal=None, infrequent_to_value=None, value=\"rare\", **kwargs)[source]Perform encoding of categorical features.

The encoding type depends on the number of classes in the column:

  • If n_classes=2 or ordinal feature, use Ordinal-encoding.
  • If 2 < n_classes <= max_onehot, use OneHot-encoding.
  • If n_classes > max_onehot, use strategy-encoding.

Missing values are propagated to the output column. Unknown classes encountered during transforming are imputed according to the selected strategy. Rare classes can be replaced with a value in order to prevent too high cardinality.

See the Encoder class for a description of the parameters.

Note

This method only encodes the categorical features. It does not encode the target column! Use the clean method for that.

Tip

Use the categorical attribute for a list of the categorical features in the dataset.

method impute(strat_num=\"drop\", strat_cat=\"drop\", max_nan_rows=None, max_nan_cols=None, **kwargs)[source]Handle missing values in the dataset.

Impute or remove missing values according to the selected strategy. Also removes rows and columns with too many missing values. Use the missing attribute to customize what are considered \"missing values\".

See the Imputer class for a description of the parameters.

Tip

Use the nans attribute to check the amount of missing values per column.

method normalize(strategy=\"yeojohnson\", **kwargs)[source]Transform the data to follow a Normal/Gaussian distribution.

This transformation is useful for modeling issues related to heteroscedasticity (non-constant variance), or other situations where normality is desired. Missing values are disregarded in fit and maintained in transform. Ignores categorical columns.

See the Normalizer class for a description of the parameters.

Tip

Use the plot_distribution method to examine a column's distribution.

method prune(strategy=\"zscore\", method=\"drop\", max_sigma=3, include_target=False, **kwargs)[source]Prune outliers from the training set.

Replace or remove outliers. The definition of outlier depends on the selected strategy and can greatly differ from one another. Ignores categorical columns.

See the Pruner class for a description of the parameters.

Note

This transformation is only applied to the training set in order to maintain the original distribution of samples in the test set.

Tip

Use the outliers attribute to check the number of outliers per column.

method scale(strategy=\"standard\", include_binary=False, **kwargs)[source]Scale the data.

Apply one of sklearn's scalers. Categorical columns are ignored.

See the Scaler class for a description of the parameters.

Tip

Use the scaled attribute to check whether the dataset is scaled.

"}, {"location": "API/ATOM/atomforecaster/#nlp", "title": "NLP", "text": "

The Natural Language Processing (NLP) transformers help to convert raw text to meaningful numeric values, ready to be ingested by a model. All transformations are applied only on the column in the dataset called corpus. Read more in the user guide.

textcleanApply standard text cleaning to the corpus.textnormalizeNormalize the corpus.tokenizeTokenize the corpus.vectorizeVectorize the corpus.

method textclean(decode=True, lower_case=True, drop_email=True, regex_email=None, drop_url=True, regex_url=None, drop_html=True, regex_html=None, drop_emoji=True, regex_emoji=None, drop_number=True, regex_number=None, drop_punctuation=True, **kwargs)[source]Apply standard text cleaning to the corpus.

Transformations include normalizing characters and dropping noise from the text (emails, HTML tags, URLs, etc...). The transformations are applied on the column named corpus, in the same order the parameters are presented. If there is no column with that name, an exception is raised.

See the TextCleaner class for a description of the parameters.

method textnormalize(stopwords=True, custom_stopwords=None, stem=False, lemmatize=True, **kwargs)[source]Normalize the corpus.

Convert words to a more uniform standard. The transformations are applied on the column named corpus, in the same order the parameters are presented. If there is no column with that name, an exception is raised. If the provided documents are strings, words are separated by spaces.

See the TextNormalizer class for a description of the parameters.

method tokenize(bigram_freq=None, trigram_freq=None, quadgram_freq=None, **kwargs)[source]Tokenize the corpus.

Convert documents into sequences of words. Additionally, create n-grams (represented by words united with underscores, e.g., \"New_York\") based on their frequency in the corpus. The transformations are applied on the column named corpus. If there is no column with that name, an exception is raised.

See the Tokenizer class for a description of the parameters.

method vectorize(strategy=\"bow\", return_sparse=True, **kwargs)[source]Vectorize the corpus.

Transform the corpus into meaningful vectors of numbers. The transformation is applied on the column named corpus. If there is no column with that name, an exception is raised.

If strategy=\"bow\" or \"tfidf\", the transformed columns are named after the word they are embedding with the prefix corpus_. If strategy=\"hashing\", the columns are named hash[N], where N stands for the n-th hashed column.

See the Vectorizer class for a description of the parameters.

"}, {"location": "API/ATOM/atomforecaster/#feature-engineering", "title": "Feature engineering", "text": "

To further pre-process the data, it's possible to extract features from datetime columns, create new non-linear features transforming the existing ones, group similar features or, if the dataset is too large, remove features. Read more in the user guide.

feature_extractionExtract features from datetime columns.feature_generationGenerate new features.feature_groupingExtract statistics from similar features.feature_selectionReduce the number of features in the data.

method feature_extraction(features=('day', 'month', 'year'), fmt=None, encoding_type=\"ordinal\", drop_columns=True, **kwargs)[source]Extract features from datetime columns.

Create new features extracting datetime elements (day, month, year, etc...) from the provided columns. Columns of dtype datetime64 are used as is. Categorical columns that can be successfully converted to a datetime format (less than 30% NaT values after conversion) are also used.

See the FeatureExtractor class for a description of the parameters.

method feature_generation(strategy=\"dfs\", n_features=None, operators=None, **kwargs)[source]Generate new features.

Create new combinations of existing features to capture the non-linear relations between the original features.

See the FeatureGenerator class for a description of the parameters.

method feature_grouping(groups, operators=None, drop_columns=True, **kwargs)[source]Extract statistics from similar features.

Replace groups of features with related characteristics with new features that summarize statistical properties of the group. The statistical operators are calculated over every row of the group. The group names and features can be accessed through the groups method.

See the FeatureGrouper class for a description of the parameters.

Tip

Use a regex pattern with the groups parameter to select groups more easily, e.g., atom.feature_grouping({\"group1\": \"var_.+\"}) to select all features that start with var_.

method feature_selection(strategy=None, solver=None, n_features=None, min_repeated=2, max_repeated=1.0, max_correlation=1.0, **kwargs)[source]Reduce the number of features in the data.

Apply feature selection or dimensionality reduction, either to improve the estimators' accuracy or to boost their performance on very high-dimensional datasets. Additionally, remove multicollinear and low-variance features.

See the FeatureSelector class for a description of the parameters.

Note

  • When strategy=\"univariate\" and solver=None, f_classif or f_regression is used as default solver.
  • When strategy is \"sfs\", \"rfecv\" or any of the advanced strategies and no scoring is specified, atom's metric (if it exists) is used as scoring.

"}, {"location": "API/ATOM/atomforecaster/#training", "title": "Training", "text": "

The training methods are where the models are fitted to the data and their performance is evaluated against a selected metric. There are three methods to call the three different training approaches. Read more in the user guide.

runTrain and evaluate the models in a direct fashion.successive_halvingFit the models in a successive halving fashion.train_sizingTrain and evaluate the models in a train sizing fashion.

method run(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a direct fashion.

Contrary to successive_halving and train_sizing, the direct approach only iterates once over the models, using the full dataset.

The following steps are applied to every model:

  1. Apply hyperparameter tuning (optional).
  2. Fit the model on the training set using the best combination of hyperparameters found.
  3. Evaluate the model on the test set.
  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

See the DirectClassifier or DirectRegressor class for a description of the parameters.

method successive_halving(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Fit the models in a successive halving fashion.

The successive halving technique is a bandit-based algorithm that fits N models to 1/N of the data. The best half are selected to go to the next iteration where the process is repeated. This continues until only one model remains, which is fitted on the complete dataset. Beware that a model's performance can depend greatly on the amount of data on which it is trained. For this reason, it is recommended to only use this technique with similar models, e.g., only using tree-based models.

The following steps are applied to every model (per iteration):

  1. Apply hyperparameter tuning (optional).
  2. Fit the model on the training set using the best combination of hyperparameters found.
  3. Evaluate the model on the test set.
  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

See the SuccessiveHalvingClassifier or SuccessiveHalvingRegressor class for a description of the parameters.

method train_sizing(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a train sizing fashion.

When training models, there is usually a trade-off between model performance and computation time, which is regulated by the number of samples in the training set. This method can be used to gain insight into this trade-off and help determine the optimal size of the training set. The models are fitted multiple times, each time increasing the number of samples in the training set.

The following steps are applied to every model (per iteration):

  1. Apply hyperparameter tuning (optional).
  2. Fit the model on the training set using the best combination of hyperparameters found.
  3. Evaluate the model on the test set.
  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

See the TrainSizingClassifier or TrainSizingRegressor class for a description of the parameters.

"}, {"location": "API/ATOM/atommodel/", "title": "ATOMModel", "text": "

function atom.api.ATOMModel(estimator, name=None, acronym=None, needs_scaling=False, native_multilabel=False, native_multioutput=False, has_validation=None)[source]Convert an estimator to a model that can be ingested by atom.

This function adds the relevant attributes to the estimator so that they can be used by atom. Note that only estimators that follow sklearn's API are compatible.

Read more about custom models in the user guide.

Parametersestimator: Predictor Custom estimator. Should implement a fit and predict method.

name: str or None, default=None Name for the model. This is the value used to call the model from atom. The value should start with the model's acronym when specified. If None, the capital letters of the estimator's name are used (only if two or more, else it uses the entire name).

acronym: str or None, default=None Model's acronym. If None, it uses the model's name. Specify this parameter when you want to train multiple custom models that share the same estimator.

needs_scaling: bool, default=False Whether the model should use automated feature scaling.

native_multilabel: bool, default=False Whether the model has native support for multilabel tasks. If False and the task is multilabel, a multilabel meta-estimator is wrapped around the estimator.

native_multioutput: bool, default=False Whether the model has native support for multioutput tasks. If False and the task is multioutput, a multioutput meta-estimator is wrapped around the estimator.

has_validation: str or None, default=None Whether the model allows in-training validation.

  • If None: No support for in-training validation.
  • If str: Name of the estimator's parameter that states the number of iterations, e.g., n_estimators for RandomForestClassifier.

ReturnsPredictor Estimator with provided information. Provide this instance to the models parameter of the run method.

"}, {"location": "API/ATOM/atommodel/#example", "title": "Example", "text": "
>>> from atom import ATOMRegressor, ATOMModel\n>>> from sklearn.datasets import load_diabetes\n>>> from sklearn.linear_model import RANSACRegressor\n\n>>> ransac = ATOMModel(\n...     estimator=RANSACRegressor(),\n...     name=\"RANSAC\",\n...     needs_scaling=False,\n... )\n\n>>> X, y = load_diabetes(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMRegressor(X, y, verbose=2)\n\n<< ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Regression.\n\nDataset stats ==================== >>\nShape: (442, 11)\nTrain set size: 354\nTest set size: 88\n-------------------------------------\nMemory: 39.03 kB\nScaled: False\nOutlier values: 12 (0.3%)\n\n\n>>> atom.run(ransac)\n\n\nTraining ========================= >>\nModels: RANSAC\nMetric: r2\n\n\nResults for RANSACRegressor:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.2946\nTest evaluation --> r2: 0.3787\nTime elapsed: 0.059s\n-------------------------------------------------\nTime: 0.059s\n\n\nFinal results ==================== >>\nTotal time: 0.060s\n-------------------------------------\nRANSACRegressor --> r2: 0.3787\n
"}, {"location": "API/ATOM/atomregressor/", "title": "ATOMRegressor", "text": "

class atom.api.ATOMRegressor(*arrays, y=-1, index=False, shuffle=True, n_rows=1, test_size=0.2, holdout_size=None, n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Main class for regression tasks.

Apply all data transformations and model management provided by the package on a given dataset. Note that, contrary to sklearn's API, the instance contains the dataset on which to perform the analysis. Calling a method will automatically apply it on the dataset it contains.

All data cleaning, feature engineering, model training and plotting functionality can be accessed from an instance of this class.

Parameters*arrays: sequence of indexables Dataset containing features and target. Allowed formats are:

  • X
  • X, y
  • train, test
  • train, test, holdout
  • X_train, X_test, y_train, y_test
  • X_train, X_test, X_holdout, y_train, y_test, y_holdout
  • (X_train, y_train), (X_test, y_test)
  • (X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)

X, train, test: dataframe-like Feature set with shape=(n_samples, n_features).

y: int, str or sequence Target column corresponding to `X`.

  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe: Target columns for multioutput tasks.

y: int, str, dict, sequence or dataframe, default=-1 Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe: Target columns for multioutput tasks.

This parameter is ignored if the target column is provided through arrays.

index: bool, int, str or sequence, default=False Handle the index in the resulting dataframe.

  • If False: Reset to RangeIndex.
  • If True: Use the provided index.
  • If int: Position of the column to use as index.
  • If str: Name of the column to use as index.
  • If sequence: Array with shape=(n_samples,) to use as index.

test_size: int or float, default=0.2

  • If <=1: Fraction of the dataset to include in the test set.
  • If >1: Number of rows to include in the test set.

This parameter is ignored if the test set is provided through arrays.

holdout_size: int, float or None, default=None

  • If None: No holdout data set is kept apart.
  • If <=1: Fraction of the dataset to include in the holdout set.
  • If >1: Number of rows to include in the holdout set.

This parameter is ignored if the holdout set is provided through arrays.

shuffle: bool, default=True Whether to shuffle the dataset before splitting the train and test set. Be aware that not shuffling the dataset can cause an unequal distribution of target classes over the sets.

n_rows: int or float, default=1 Random subsample of the dataset to use. The default value selects all rows.

  • If <=1: Fraction of the dataset to select.
  • If >1: Exact number of rows to select. Only if arrays is X or X, y.

n_jobs: int, default=1 Number of cores to use for parallel processing.

  • If >0: Number of cores to use.
  • If -1: Use all available cores.
  • If <-1: Use number of cores - 1 + n_jobs.

device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

  • \"data\":

    • \"numpy\"
    • \"pyarrow\"
    • \"modin\"
  • \"estimator\":

    • \"sklearn\"
    • \"sklearnex\"
    • \"cuml\"

backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

  • \"loky\": Single-node, process-based parallelism.
  • \"multiprocessing\": Legacy single-node, process-based parallelism. Less robust than loky.
  • \"threading\": Single-node, thread-based parallelism.
  • \"ray\": Multi-node, process-based parallelism.

memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide.

  • If False: No caching is performed.
  • If True: A default temp directory is used.
  • If str: Path to the caching directory.
  • If Path: A pathlib.Path to the caching directory.
  • If Memory: Object with the joblib.Memory interface.

verbose: int, default=0 Verbosity level of the class. Choose from:

  • 0 to not print anything.
  • 1 to print basic information.
  • 2 to print detailed information.

warnings: bool or str, default=False

  • If True: Default warning action (equal to \"once\").
  • If False: Suppress all warnings (equal to \"ignore\").
  • If str: One of python's warnings filters.

Changing this parameter affects the PYTHONWARNINGS environment variable. ATOM can't manage warnings that go from C/C++ code to stdout.

logger: str, Logger or None, default=None

  • If None: Logging isn't used.
  • If str: Name of the log file. Use \"auto\" for automatic name.
  • If Path: A pathlib.Path to the log file.
  • Else: Python logging.Logger instance.

experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed.

random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

See Also

ATOMClassifier Main class for classification tasks.

ATOMForecaster Main class for forecasting tasks.

"}, {"location": "API/ATOM/atomregressor/#example", "title": "Example", "text": "
>>> from atom import ATOMRegressor\n>>> from sklearn.datasets import load_diabetes\n\n>>> X, y = load_diabetes(return_X_y=True, as_frame=True)\n\n>>> # Initialize atom\n>>> atom = ATOMRegressor(X, y, verbose=2)\n\n<< ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Regression.\n\nDataset stats ==================== >>\nShape: (442, 11)\nTrain set size: 354\nTest set size: 88\n-------------------------------------\nMemory: 39.03 kB\nScaled: False\nOutlier values: 11 (0.3%)\n\n\n\n>>> # Apply data cleaning and feature engineering methods\n>>> atom.scale()\n\nFitting Scaler...\nScaling features...\n\n>>> atom.feature_selection(strategy=\"rfecv\", solver=\"xgb\", n_features=12)\n\nFitting FeatureSelector...\nPerforming feature selection ...\n --> rfecv selected 10 features from the dataset.\n\n\n>>> # Train models\n>>> atom.run(models=[\"OLS\", \"RF\", \"XGB\"])\n\n\nTraining ========================= >>\nModels: OLS, RF, XGB\nMetric: r2\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.5313\nTest evaluation --> r2: 0.4452\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.9203\nTest evaluation --> r2: 0.3471\nTime elapsed: 0.434s\n-------------------------------------------------\nTime: 0.434s\n\n\nResults for XGBoost:\nFit ---------------------------------------------\nTrain evaluation --> r2: 1.0\nTest evaluation --> r2: 0.2881\nTime elapsed: 0.187s\n-------------------------------------------------\nTime: 0.187s\n\n\nFinal results ==================== >>\nTotal time: 0.645s\n-------------------------------------\nOrdinaryLeastSquares --> r2: 0.4452 !\nRandomForest         --> r2: 0.3471 ~\nXGBoost              --> r2: 0.2881 ~\n\n\n>>> # Analyze the results\n>>> print(atom.results)\n\n     r2_train  
r2_test  time_fit      time\nOLS    0.5313   0.4452  0.020018  0.020018\nRF     0.9203   0.3471  0.434395  0.434395\nXGB    1.0000   0.2881  0.187170  0.187170\n\n\n>>> print(atom.evaluate())\n\n         mae    mape        mse      r2     rmse\nOLS -45.1949 -0.4267 -3172.9439  0.4452 -56.3289\nRF  -49.8684 -0.4612 -3733.6766  0.3471 -61.1038\nXGB -52.0370 -0.4708 -4071.0416  0.2881 -63.8047\n
"}, {"location": "API/ATOM/atomregressor/#magic-methods", "title": "Magic methods", "text": "

The class contains some magic methods to help you access some of its elements faster. Note that methods that apply on the pipeline can return different results per branch.

  • __repr__: Prints an overview of atom's branches, models and metric.
  • __len__: Returns the length of the dataset.
  • __iter__: Iterate over the pipeline's transformers.
  • __contains__: Checks if the provided item is a column in the dataset.
  • __getitem__: Access a branch, model, column or subset of the dataset.

"}, {"location": "API/ATOM/atomregressor/#attributes", "title": "Attributes", "text": ""}, {"location": "API/ATOM/atomregressor/#data-attributes", "title": "Data attributes", "text": "

The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.

Attributespipeline: PipelinePipeline of transforms.

Tip

Use the plot_pipeline method to visualize the pipeline.

mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set.

This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). scaled: boolWhether the feature set is scaled.

A data set is considered scaled when it has mean=0 and std=1, or when there is a scaler in the pipeline. Binary columns (only zeros and ones) are excluded from the calculation. duplicates: int | numpy.integerNumber of duplicate rows in the dataset. missing: list[Any]Values that are considered \"missing\".

These values are used by the clean and impute methods. Default values are: None, NaN, NA, NaT, +inf, -inf, \"\", \"?\", \"NA\", \"nan\", \"NaN\", \"NaT\", \"none\", \"None\", \"inf\", \"-inf\". Note that None, NaN, NA, +inf and -inf are always considered missing since they are incompatible with sklearn estimators. nans: Series | modin.pandas.series.SeriesColumns with the number of missing values in them.

This property is unavailable for sparse datasets. n_nans: intNumber of rows containing missing values.

This property is unavailable for sparse datasets. numerical: IndexNames of the numerical features in the dataset. n_numerical: intNumber of numerical features in the dataset. categorical: IndexNames of the categorical features in the dataset. n_categorical: intNumber of categorical features in the dataset. outliers: SeriesColumns in training set with number of outlier values.

This property is unavailable for sparse datasets. n_outliers: int | numpy.integerNumber of samples in the training set containing outliers.

This property is unavailable for sparse datasets.

"}, {"location": "API/ATOM/atomregressor/#utility-attributes", "title": "Utility attributes", "text": "

The utility attributes are used to access information about the models in the instance after training.

Attributesbranch: BranchCurrent active branch.

Use the property's @setter to change the branch or to create a new one. If the value is the name of an existing branch, switch to that one. Else, create a new branch using that name. The new branch is split from the current branch. Use _from_ to split the new branch from any other existing branch. Read more in the user guide. models: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance.

Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. winner: model | NoneBest performing model.

Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. results: pd.DataFrameOverview of the training results.

All durations are in seconds. Possible values include:

  • [metric]_ht: Score obtained by the hyperparameter tuning.
  • time_ht: Duration of the hyperparameter tuning.
  • [metric]_train: Metric score on the train set.
  • [metric]_test: Metric score on the test set.
  • time_fit: Duration of the model fitting on the train set.
  • [metric]_bootstrap: Mean score on the bootstrapped samples.
  • time_bootstrap: Duration of the bootstrapping.
  • time: Total duration of the run.

"}, {"location": "API/ATOM/atomregressor/#tracking-attributes", "title": "Tracking attributes", "text": "

The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.

Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline.

"}, {"location": "API/ATOM/atomregressor/#plot-attributes", "title": "Plot attributes", "text": "

The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

Attributespalette: str | Sequence[str]Color palette.

Specify one of plotly's built-in palettes or create a custom one, e.g., atom.palette = [\"red\", \"green\", \"blue\"]. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers.

"}, {"location": "API/ATOM/atomregressor/#utility-methods", "title": "Utility methods", "text": "

Next to the plotting methods, the class contains a variety of utility methods to handle the data and manage the pipeline.

addAdd a transformer to the pipeline.applyApply a function to the dataset.available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.distributionGet statistics on column distributions.edaCreate an Exploratory Data Analysis report.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_sample_weightReturn sample weights for a balanced data set.inverse_transformInversely transform new data through the pipeline.loadLoad an atom instance from a pickle file.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.resetReset the instance to its initial state.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_dataSave the data in the current branch to a .csv file.shrinkConvert the columns to the smallest possible matching dtype.stackingAdd a Stacking model to the pipeline.statsDisplay basic information about the dataset.statusGet an overview of the branches and models.transformTransform new data through the pipeline.votingAdd a Voting model to the pipeline.

method add(transformer, columns=None, train_only=False, **fit_params)[source]Add a transformer to the pipeline.

If the transformer is not fitted, it is fitted on the complete training set. Afterwards, the data set is transformed and the estimator is added to atom's pipeline. If the estimator is a sklearn Pipeline, every estimator is merged independently with atom.

Warning

  • The transformer should have fit and/or transform methods with arguments X (accepting a dataframe-like object of shape=(n_samples, n_features)) and/or y (accepting a sequence of shape=(n_samples,)).
  • The transform method should return a feature set as a dataframe-like object of shape=(n_samples, n_features) and/or a target column as a sequence of shape=(n_samples,).

Note

If the transform method doesn't return a dataframe:

  • The column naming happens as follows. If the transformer has a get_feature_names_out method, it is used. If not, and it returns the same number of columns, the names are kept equal. If the number of columns changes, old columns will keep their name (as long as the column is unchanged) and new columns will receive the name x[N-1], where N stands for the n-th feature. This means that a transformer should only transform, add or drop columns, not combinations of these.
  • The index remains the same as before the transformation. This means that the transformer should not add, remove or shuffle rows unless it returns a dataframe.

Note

If the transformer has a n_jobs and/or random_state parameter that is left to its default value, it adopts atom's value.

Parameterstransformer: Transformer Estimator to add to the pipeline. Should implement a transform method.

columns: int, str, segment, sequence or None, default=None Selection of columns to transform. Only select features or the target column, not both at the same time (if that happens, the target column is ignored). If None, transform all columns.

train_only: bool, default=False Whether to apply the estimator only on the training set or on the complete dataset. Note that if True, the transformation is skipped when making predictions on new data.

**fit_params Additional keyword arguments for the transformer's fit method.

method apply(func, inverse_func=None, kw_args=None, inv_kw_args=None, **kwargs)[source]Apply a function to the dataset.

This method is useful for stateless transformations such as taking the log, doing custom scaling, etc...

Note

This approach is preferred over changing the dataset directly through the property's @setter since the transformation is stored in the pipeline.

Tip

Use atom.apply(lambda df: df.drop(\"column_name\",axis=1)) to store the removal of columns in the pipeline.

Parametersfunc: callable Function to apply with signature func(dataset, **kw_args) -> dataset.

inverse_func: callable or None, default=None Inverse function of func. If None, the inverse_transform method returns the input unchanged.

kw_args: dict or None, default=None Additional keyword arguments for the function.

inv_kw_args: dict or None, default=None Additional keyword arguments for the inverse function.

method available_models()[source]Give an overview of the available predefined models.

Returnspd.DataFrame Information about the available predefined models. Columns include:

  • acronym: Model's acronym (used to call the model).
  • model: Name of the model's class.
  • estimator: The model's underlying estimator.
  • module: The estimator's module.
  • needs_scaling: Whether the model requires feature scaling.
  • accepts_sparse: Whether the model accepts sparse matrices.
  • native_multilabel: Whether the model has native support for multilabel tasks.
  • native_multioutput: Whether the model has native support for multioutput tasks.
  • has_validation: Whether the model has in-training validation.
  • supports_engines: Engines supported by the model.

method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

Parametersrows: int, default=1 Number of plots in length.

cols: int, default=2 Number of plots in width.

horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

title: str, dict or None, default=None Title for the plot.

  • If None, no title is shown.
  • If str, text for the title.
  • If dict, title configuration.

legend: str, dict or None, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

  • If None: No legend is shown.
  • If str: Location where to show the legend.
  • If dict: Legend configuration.

figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

display: bool, default=True Whether to render the plot.

Yieldsgo.Figure Plot object.

method clear()[source]Reset attributes and clear cache from all models.

Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

  • In-training validation scores
  • Shap values
  • App instance
  • Dashboard instance
  • Calculated holdout data sets

method delete(models=None)[source]Delete models.

If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.

Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted.

method distribution(distributions=None, columns=None)[source]Get statistics on column distributions.

Compute the Kolmogorov-Smirnov test for various distributions against columns in the dataset. Only for numerical columns. Missing values are ignored.

Tip

Use the plot_distribution method to plot a column's distribution.

Parametersdistributions: str, sequence or None, default=None Names of the distributions in scipy.stats to get the statistics on. If None, a selection of the most common ones is used.

columns: int, str, segment, sequence or None, default=None Selection of columns to perform the test on. If None, select all numerical columns.

Returnspd.DataFrame Statistic results with multiindex levels:

  • dist: Name of the distribution.
  • stat: Statistic results:
    • score: KS-test score.
    • p_value: Corresponding p-value.

method eda(rows=\"dataset\", target=0, filename=None)[source]Create an Exploratory Data Analysis report.

ATOM uses the sweetviz package for EDA. The report is rendered directly in the notebook. It can also be accessed through the report attribute. It can either report one dataset or compare two datasets against each other.

Warning

This method can be slow for large datasets.

Parametersrows: str, sequence or dict, default=\"dataset\" Selection of rows to include in the report.

  • If str: Name of the data set to report.
  • If sequence: Names of two data sets to compare.
  • If dict: Names of up to two data sets with corresponding selection of rows to report.

target: int or str, default=0 Target column to look at. Only for multilabel tasks. Only bool and numerical features can be used as target.

filename: str, Path or None, default=None Filename or pathlib.Path of the (html) file to save. If None, don't save anything.

method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.

Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task.

rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

  • The task is binary or multilabel classification.
  • The model has a predict_proba method.
  • The metric evaluates predicted probabilities.

For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.

sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

Returnspd.DataFrame Scores of the models.

method export_pipeline(model=None)[source]Export the internal pipeline.

This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported.

ReturnsPipeline Current branch as a sklearn-like Pipeline object.

method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.

Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.

Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks.

method get_sample_weight(rows=\"train\")[source]Return sample weights for a balanced data set.

The returned weights are inversely proportional to the class frequencies in the selected data set. For multioutput tasks, the weights of each column of y will be multiplied.

Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

Returnsseries Sequence of weights with shape=(n_samples,).

method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be used to transform only the target column.

ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe: Target columns for multioutput tasks.

verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

Returnsdataframe Original feature set. Only returned if provided.

series or dataframe Original target column. Only returned if provided.

function atom.atom.load(filename, data=None)[source]Load an atom instance from a pickle file.

If the instance was saved using save_data=False, it's possible to load new data into it and apply all data transformations.

Info

The loaded instance's current branch is the same branch as it was when saved.

Parametersfilename: str or Path Filename or pathlib.Path of the pickle file.

data: tuple of indexables or None, default=None Original dataset as it was provided to the instance's constructor. Only use this parameter if the loaded file was saved using save_data=False. Allowed formats are:

  • X
  • X, y
  • train, test
  • train, test, holdout
  • X_train, X_test, y_train, y_test
  • X_train, X_test, X_holdout, y_train, y_test, y_holdout
  • (X_train, y_train), (X_test, y_test)
  • (X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)

X, train, test: dataframe-like Feature set with shape=(n_samples, n_features).

y: int, str, dict, sequence or dataframe Target column corresponding to `X`.

  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe: Target columns for multioutput tasks.

Returnsatom Unpickled atom instance.

method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.

Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.

Parametersother: Runner Instance with which to merge. Should be of the same class as self.

suffix: str, default=\"2\" Branches and models with conflicting names are merged adding suffix to the end of their names.

method update_layout(**kwargs)[source]Update the properties of the plot's layout.

Recursively update the structure of the original layout with the values in the arguments.

Parameters**kwargs Keyword arguments for the figure's update_layout method.

method update_traces(**kwargs)[source]Update the properties of the plot's traces.

Recursively update the structure of the original traces with the values in the arguments.

Parameters**kwargs Keyword arguments for the figure's update_traces method.

method reset(hard=False)[source]Reset the instance to its initial state.

Deletes all branches and models. The dataset is also reset to its form after initialization.

Parametershard: bool, default=False If True, completely flushes the cache.

method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance.

method save_data(filename=\"auto\", rows=\"dataset\", **kwargs)[source]Save the data in the current branch to a .csv file.

Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

rows: hashable, segment, sequence or dataframe, default=\"dataset\" Selection of rows to save.

**kwargs Additional keyword arguments for pandas' to_csv method.

method shrink(int2bool=False, int2uint=False, str2cat=False, dense2sparse=False, columns=None)[source]Convert the columns to the smallest possible matching dtype.

Examples are: float64 -> float32, int64 -> int8, etc... Sparse arrays also transform their non-fill value. Use this method for memory optimization before saving the dataset. Note that applying transformers to the data may alter the types again.

Parametersint2bool: bool, default=False Whether to convert int columns to bool type. Only if the values in the column are strictly in (0, 1) or (-1, 1).

int2uint: bool, default=False Whether to convert int to uint (unsigned integer). Only if the values in the column are strictly positive.

str2cat: bool, default=False Whether to convert string to category. Only if the number of categories is less than 30% of the column's length.

dense2sparse: bool, default=False Whether to convert all features to sparse format. The value that is compressed is the most frequent value in the column.

columns: int, str, segment, sequence or None, default=None Selection of columns to shrink. If None, transform all columns.

method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.

Warning

Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: Stack.

**kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the final_estimator parameter.

method stats()[source]Display basic information about the dataset.

method status()[source]Get an overview of the branches and models.

This method prints the same information as the __repr__ and also saves it to the logger.

method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe: Target columns for multioutput tasks.

verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

Returnsdataframe Transformed feature set. Only returned if provided.

series or dataframe Transformed target column. Only returned if provided.

method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.

Warning

Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

Parametersmodels: segment, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch.

name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: Vote.

**kwargs Additional keyword arguments for sklearn's voting instance.

"}, {"location": "API/ATOM/atomregressor/#data-cleaning", "title": "Data cleaning", "text": "

The data cleaning methods can help you scale the data, handle missing values, categorical columns and outliers. All attributes of the data cleaning classes are attached to atom after running. Read more in the user guide.

Tip

Use the eda method to examine the data and help you determine suitable parameters for the data cleaning methods.

cleanApply standard data cleaning steps on the dataset.discretizeBin continuous data into intervals.encodePerform encoding of categorical features.imputeHandle missing values in the dataset.normalizeTransform the data to follow a Normal/Gaussian distribution.prunePrune outliers from the training set.scaleScale the data.

method clean(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, **kwargs)[source]Apply standard data cleaning steps on the dataset.

Use the parameters to choose which transformations to perform. The available steps are:

  • Convert dtypes to the best possible types.
  • Drop columns with specific data types.
  • Remove characters from column names.
  • Strip categorical features from spaces.
  • Drop duplicate rows.
  • Drop rows with missing values in the target column.
  • Encode the target column (ignored for regression tasks).

See the Cleaner class for a description of the parameters.

method discretize(strategy=\"quantile\", bins=5, labels=None, **kwargs)[source]Bin continuous data into intervals.

For each feature, the bin edges are computed during fit and, together with the number of bins, they will define the intervals. Ignores categorical columns.

See the Discretizer class for a description of the parameters.

Tip

Use the plot_distribution method to visualize a column's distribution and decide on the bins.

method encode(strategy=\"Target\", max_onehot=10, ordinal=None, infrequent_to_value=None, value=\"rare\", **kwargs)[source]Perform encoding of categorical features.

The encoding type depends on the number of classes in the column:

  • If n_classes=2 or ordinal feature, use Ordinal-encoding.
  • If 2 < n_classes <= max_onehot, use OneHot-encoding.
  • If n_classes > max_onehot, use strategy-encoding.

Missing values are propagated to the output column. Unknown classes encountered during transforming are imputed according to the selected strategy. Rare classes can be replaced with a value in order to prevent too high cardinality.

See the Encoder class for a description of the parameters.

Note

This method only encodes the categorical features. It does not encode the target column! Use the clean method for that.

Tip

Use the categorical attribute for a list of the categorical features in the dataset.

method impute(strat_num=\"drop\", strat_cat=\"drop\", max_nan_rows=None, max_nan_cols=None, **kwargs)[source]Handle missing values in the dataset.

Impute or remove missing values according to the selected strategy. Also removes rows and columns with too many missing values. Use the missing attribute to customize what are considered \"missing values\".

See the Imputer class for a description of the parameters.

Tip

Use the nans attribute to check the amount of missing values per column.

method normalize(strategy=\"yeojohnson\", **kwargs)[source]Transform the data to follow a Normal/Gaussian distribution.

This transformation is useful for modeling issues related to heteroscedasticity (non-constant variance), or other situations where normality is desired. Missing values are disregarded in fit and maintained in transform. Ignores categorical columns.

See the Normalizer class for a description of the parameters.

Tip

Use the plot_distribution method to examine a column's distribution.

method prune(strategy=\"zscore\", method=\"drop\", max_sigma=3, include_target=False, **kwargs)[source]Prune outliers from the training set.

Replace or remove outliers. The definition of an outlier depends on the selected strategy and can differ greatly between strategies. Ignores categorical columns.

See the Pruner class for a description of the parameters.

Note

This transformation is only applied to the training set in order to maintain the original distribution of samples in the test set.

Tip

Use the outliers attribute to check the number of outliers per column.

method scale(strategy=\"standard\", include_binary=False, **kwargs)[source]Scale the data.

Apply one of sklearn's scalers. Categorical columns are ignored.

See the Scaler class for a description of the parameters.

Tip

Use the scaled attribute to check whether the dataset is scaled.

"}, {"location": "API/ATOM/atomregressor/#nlp", "title": "NLP", "text": "

The Natural Language Processing (NLP) transformers help to convert raw text to meaningful numeric values, ready to be ingested by a model. All transformations are applied only on the column in the dataset called corpus. Read more in the user guide.

textcleanApply standard text cleaning to the corpus.textnormalizeNormalize the corpus.tokenizeTokenize the corpus.vectorizeVectorize the corpus.

method textclean(decode=True, lower_case=True, drop_email=True, regex_email=None, drop_url=True, regex_url=None, drop_html=True, regex_html=None, drop_emoji=True, regex_emoji=None, drop_number=True, regex_number=None, drop_punctuation=True, **kwargs)[source]Apply standard text cleaning to the corpus.

Transformations include normalizing characters and dropping noise from the text (emails, HTML tags, URLs, etc...). The transformations are applied on the column named corpus, in the same order the parameters are presented. If there is no column with that name, an exception is raised.

See the TextCleaner class for a description of the parameters.

method textnormalize(stopwords=True, custom_stopwords=None, stem=False, lemmatize=True, **kwargs)[source]Normalize the corpus.

Convert words to a more uniform standard. The transformations are applied on the column named corpus, in the same order the parameters are presented. If there is no column with that name, an exception is raised. If the provided documents are strings, words are separated by spaces.

See the TextNormalizer class for a description of the parameters.

method tokenize(bigram_freq=None, trigram_freq=None, quadgram_freq=None, **kwargs)[source]Tokenize the corpus.

Convert documents into sequences of words. Additionally, create n-grams (represented by words united with underscores, e.g., \"New_York\") based on their frequency in the corpus. The transformations are applied on the column named corpus. If there is no column with that name, an exception is raised.

See the Tokenizer class for a description of the parameters.

method vectorize(strategy=\"bow\", return_sparse=True, **kwargs)[source]Vectorize the corpus.

Transform the corpus into meaningful vectors of numbers. The transformation is applied on the column named corpus. If there is no column with that name, an exception is raised.

If strategy=\"bow\" or \"tfidf\", the transformed columns are named after the word they are embedding with the prefix corpus_. If strategy=\"hashing\", the columns are named hash[N], where N stands for the n-th hashed column.

See the Vectorizer class for a description of the parameters.

"}, {"location": "API/ATOM/atomregressor/#feature-engineering", "title": "Feature engineering", "text": "

To further pre-process the data, it's possible to extract features from datetime columns, create new non-linear features transforming the existing ones, group similar features or, if the dataset is too large, remove features. Read more in the user guide.

feature_extractionExtract features from datetime columns.feature_generationGenerate new features.feature_groupingExtract statistics from similar features.feature_selectionReduce the number of features in the data.

method feature_extraction(features=('day', 'month', 'year'), fmt=None, encoding_type=\"ordinal\", drop_columns=True, **kwargs)[source]Extract features from datetime columns.

Create new features extracting datetime elements (day, month, year, etc...) from the provided columns. Columns of dtype datetime64 are used as is. Categorical columns that can be successfully converted to a datetime format (less than 30% NaT values after conversion) are also used.

See the FeatureExtractor class for a description of the parameters.

method feature_generation(strategy=\"dfs\", n_features=None, operators=None, **kwargs)[source]Generate new features.

Create new combinations of existing features to capture the non-linear relations between the original features.

See the FeatureGenerator class for a description of the parameters.

method feature_grouping(groups, operators=None, drop_columns=True, **kwargs)[source]Extract statistics from similar features.

Replace groups of features with related characteristics with new features that summarize statistical properties of the group. The statistical operators are calculated over every row of the group. The group names and features can be accessed through the groups method.

See the FeatureGrouper class for a description of the parameters.

Tip

Use a regex pattern with the groups parameter to select groups more easily, e.g., atom.feature_grouping({\"group1\": \"var_.+\"}) to select all features that start with var_.

method feature_selection(strategy=None, solver=None, n_features=None, min_repeated=2, max_repeated=1.0, max_correlation=1.0, **kwargs)[source]Reduce the number of features in the data.

Apply feature selection or dimensionality reduction, either to improve the estimators' accuracy or to boost their performance on very high-dimensional datasets. Additionally, remove multicollinear and low-variance features.

See the FeatureSelector class for a description of the parameters.

Note

  • When strategy=\"univariate\" and solver=None, f_classif or f_regression is used as default solver.
  • When strategy is \"sfs\", \"rfecv\" or any of the advanced strategies and no scoring is specified, atom's metric (if it exists) is used as scoring.

"}, {"location": "API/ATOM/atomregressor/#training", "title": "Training", "text": "

The training methods are where the models are fitted to the data and their performance is evaluated against a selected metric. There are three methods to call the three different training approaches. Read more in the user guide.

runTrain and evaluate the models in a direct fashion.successive_halvingFit the models in a successive halving fashion.train_sizingTrain and evaluate the models in a train sizing fashion.

method run(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a direct fashion.

Contrary to successive_halving and train_sizing, the direct approach only iterates once over the models, using the full dataset.

The following steps are applied to every model:

  1. Apply hyperparameter tuning (optional).
  2. Fit the model on the training set using the best combination of hyperparameters found.
  3. Evaluate the model on the test set.
  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

See the DirectClassifier or DirectRegressor class for a description of the parameters.

method successive_halving(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Fit the models in a successive halving fashion.

The successive halving technique is a bandit-based algorithm that fits N models to 1/N of the data. The best half are selected to go to the next iteration where the process is repeated. This continues until only one model remains, which is fitted on the complete dataset. Beware that a model's performance can depend greatly on the amount of data on which it is trained. For this reason, it is recommended to only use this technique with similar models, e.g., only using tree-based models.

The following steps are applied to every model (per iteration):

  1. Apply hyperparameter tuning (optional).
  2. Fit the model on the training set using the best combination of hyperparameters found.
  3. Evaluate the model on the test set.
  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

See the SuccessiveHalvingClassifier or SuccessiveHalvingRegressor class for a description of the parameters.

method train_sizing(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a train sizing fashion.

When training models, there is usually a trade-off between model performance and computation time, which is regulated by the number of samples in the training set. This method can be used to gain insight into this trade-off and help determine the optimal size of the training set. The models are fitted multiple times, each time increasing the number of samples in the training set.

The following steps are applied to every model (per iteration):

  1. Apply hyperparameter tuning (optional).
  2. Fit the model on the training set using the best combination of hyperparameters found.
  3. Evaluate the model on the test set.
  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

See the TrainSizingClassifier or TrainSizingRegressor class for a description of the parameters.

"}, {"location": "API/branch/branch/", "title": "Branch", "text": "

class atom.branch.branch.Branch(name, memory=None, data=None, holdout=None)[source]Object that contains the data.

A branch contains a specific pipeline, the dataset transformed through that pipeline, the models fitted on that dataset, and all data and utility attributes that refer to that dataset. Branches can be created and accessed through atom's branch attribute.

All public properties and attributes of the branch can be accessed from the parent.

Read more in the user guide.

Warning

This class should not be called directly. Branches are created internally by the ATOMClassifier, ATOMForecaster and ATOMRegressor classes.

Parametersname: str Name of the branch.

memory: str, Memory or None, default=None Memory object for pipeline caching and to store the data when the branch is inactive.

data: DataContainer or None, default=None Data for the branch.

holdout: dataframe or None, default=None Holdout data set.

See Also

BranchManager Object that manages branches.

"}, {"location": "API/branch/branch/#example", "title": "Example", "text": "
>>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> # Initialize atom\n>>> atom = ATOMClassifier(X, y, verbose=2)\n\n<< ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 138.97 kB\nScaled: False\nOutlier values: 177 (1.3%)\n\n\n\n>>> # Train a model\n>>> atom.run(\"RF\")\n\n\nTraining ========================= >>\nModels: RF\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9517\nTime elapsed: 0.236s\n-------------------------------------------------\nTime: 0.236s\n\n\nFinal results ==================== >>\nTotal time: 0.239s\n-------------------------------------\nRandomForest --> f1: 0.9517\n\n\n>>> # Change the branch and apply feature scaling\n>>> atom.branch = \"scaled\"\n\nSuccessfully created new branch: scaled.\n\n\n>>> atom.scale()\n\nFitting Scaler...\nScaling features...\n\n>>> atom.run(\"RF_scaled\")\n\n\nTraining ========================= >>\nModels: RF_scaled\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9517\nTime elapsed: 0.237s\n-------------------------------------------------\nTime: 0.237s\n\n\nFinal results ==================== >>\nTotal time: 0.240s\n-------------------------------------\nRandomForest --> f1: 0.9517\n\n\n>>> # Compare the models\n>>> atom.plot_roc()\n
"}, {"location": "API/branch/branch/#attributes", "title": "Attributes", "text": "

Attributespipeline: PipelinePipeline of transforms.

Tip

Use the plot_pipeline method to visualize the pipeline.

mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

"}, {"location": "API/branch/branch/#methods", "title": "Methods", "text": "

loadLoad the branch's data from memory.storeStore the branch's data as a pickle in memory.

method load(assign=True)[source]Load the branch's data from memory.

This method is used to restore the data of inactive branches.

Parametersassign: bool, default=True Whether to assign the loaded data to self.

ReturnsDataContainer or None Own data information. Returns None if no data is set.

method store(assign=True)[source]Store the branch's data as a pickle in memory.

After storage, the data is deleted, and the branch is no longer usable until load is called. This method is used to store the data for inactive branches.

Note

This method is skipped silently for branches with no memory allocation.

Parametersassign: bool, default=True Whether to assign None to the data in self.

"}, {"location": "API/branch/branchmanager/", "title": "BranchManager", "text": "

class atom.branch.branchmanager.BranchManager(memory=None)[source]Object that manages branches.

Maintains references to a series of branches and the current active branch. Additionally, always stores an 'original' branch containing the original dataset (previous to any transformations). The branches share a reference to a holdout set, not the instance self. When a memory object is specified, it stores inactive branches in memory.

Read more in the user guide.

Warning

This class should not be called directly. The BranchManager is created internally by the ATOMClassifier, ATOMForecaster and ATOMRegressor classes.

Parametersmemory: str, Memory or None, default=None Location to store inactive branches. If None, all branches are kept in memory. This memory object is passed to the branches for pipeline caching.

Attributesbranches: ClassMap Collection of branches.

og: Branch Branch containing the original dataset. It can be any branch in branches or an internally made branch called og.

current: Branch Current active branch.

See Also

Branch Object that contains the data.

"}, {"location": "API/branch/branchmanager/#example", "title": "Example", "text": "
>>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> # Initialize atom\n>>> atom = ATOMClassifier(X, y, verbose=2)\n\n<< ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 138.97 kB\nScaled: False\nOutlier values: 174 (1.2%)\n\n\n\n>>> # Train a model\n>>> atom.run(\"RF\")\n\n\nTraining ========================= >>\nModels: RF\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9655\nTime elapsed: 0.229s\n-------------------------------------------------\nTime: 0.229s\n\n\nFinal results ==================== >>\nTotal time: 0.232s\n-------------------------------------\nRandomForest --> f1: 0.9655\n\n\n>>> # Change the branch and apply feature scaling\n>>> atom.branch = \"scaled\"\n\nSuccessfully created new branch: scaled.\n\n\n>>> atom.scale()\n\nFitting Scaler...\nScaling features...\n\n>>> atom.run(\"RF_scaled\")\n\n\nTraining ========================= >>\nModels: RF_scaled\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9722\nTime elapsed: 0.228s\n-------------------------------------------------\nTime: 0.228s\n\n\nFinal results ==================== >>\nTotal time: 0.231s\n-------------------------------------\nRandomForest --> f1: 0.9722\n\n\n>>> # Compare the models\n>>> atom.plot_roc()\n
"}, {"location": "API/branch/branchmanager/#attributes", "title": "Attributes", "text": "

Attributesbranches: ClassMap Collection of branches.

og: Branch Branch containing the original dataset. It can be any branch in branches or an internally made branch called og.

current: Branch Current active branch.

"}, {"location": "API/branch/branchmanager/#methods", "title": "Methods", "text": "

addAdd a new branch to the manager.fillFill the current branch with data.resetReset this instance to its initial state.

method add(name, parent=None)[source]Add a new branch to the manager.

If the branch is called og (reserved name for the original branch), it's created separately and stored in memory.

Parametersname: str Name for the new branch.

parent: Branch or None, default=None Parent branch. Data and attributes from the parent are passed to the new branch.

method fill(data, holdout=None)[source]Fill the current branch with data.

Parametersdata: DataContainer New data for the current branch.

holdout: dataframe or None, default=None Holdout data set (if any).

method reset(hard=False)[source]Reset this instance to its initial state.

The initial state of the BranchManager contains a single branch called main with no data. There's no reference to an original (og) branch.

Parametershard: bool, default=False If True, completely flushes the cache.

"}, {"location": "API/data_cleaning/balancer/", "title": "Balancer", "text": "

class atom.data_cleaning.Balancer(strategy=\"ADASYN\", n_jobs=1, verbose=0, logger=None, random_state=None, **kwargs)[source]Balance the number of samples per class in the target column.

When oversampling, the newly created samples have an increasing integer index for numerical indices, and an index of the form [estimator]_N for non-numerical indices, where N stands for the N-th sample in the data set. Use only for classification tasks.

This class can be accessed from atom through the balance method. Read more in the user guide.

Warning

  • The clustercentroids estimator is unavailable because of incompatibilities of the APIs.
  • The Balancer class does not support multioutput tasks.

Parametersstrategy: str or estimator, default=\"ADASYN\" Type of algorithm with which to balance the dataset. Choose from the name of any estimator in the imbalanced-learn package or provide a custom instance of such.

n_jobs: int, default=1 Number of cores to use for parallel processing.

  • If >0: Number of cores to use.
  • If -1: Use all available cores.
  • If <-1: Use number of cores + 1 + value (e.g., -2 uses all cores but one).

verbose: int, default=0 Verbosity level of the class. Choose from:

  • 0 to not print anything.
  • 1 to print basic information.
  • 2 to print detailed information.

logger: str, Logger or None, default=None

  • If None: Logging isn't used.
  • If str: Name of the log file. Use \"auto\" for automatic naming.
  • Else: Python logging.Logger instance.

random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

**kwargs Additional keyword arguments for the strategy estimator.

Attributes[strategy]_: imblearn estimator Object (lowercase strategy) used to balance the data, e.g., balancer.adasyn_ for the default strategy.

mapping_: dict Target values mapped to their respective encoded integers.

feature_names_in_: np.ndarray Names of features seen during fit.

target_names_in_: np.ndarray Names of the target column seen during fit.

n_features_in_: int Number of features seen during fit.

See Also

Encoder Perform encoding of categorical features.

Imputer Handle missing values in the data.

Pruner Prune outliers from the data.

"}, {"location": "API/data_cleaning/balancer/#example", "title": "Example", "text": "atomstand-alone
>>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> print(atom.train)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630             0.054390         0.1720  ...           107.30       740.4            0.1610            0.42250          0.50300               0.22580          0.2807                  0.10710       0\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690             0.094510         0.1860  ...           142.20      1493.0            0.1492            0.25360          0.37590               0.15100          0.3074                  0.07863       0\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699             0.047440         0.1538  ...           135.10      1320.0            0.1315            0.18060          0.20800               0.11360          0.2504                  0.07948       0\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686             0.027390         0.1852  ...           110.10       931.4            0.1148            0.09866          0.15470               0.06575          0.3233                  0.06165       0\n4           8.95         15.76           58.74      245.2          0.09462           0.12430         0.09263             0.023080         0.1305  ...            
63.34       270.0            0.1179            0.18790          0.15440               0.03846          0.1652                  0.07722       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     ...\n451        19.73         19.82          130.70     1206.0          0.10620           0.18490         0.24170             0.097400         0.1733  ...           159.80      1933.0            0.1710            0.59550          0.84890               0.25070          0.2749                  0.12970       0\n452        12.72         13.78           81.78      492.1          0.09667           0.08393         0.01288             0.019240         0.1638  ...            88.54       553.7            0.1298            0.14720          0.05233               0.06343          0.2369                  0.06922       1\n453        11.51         23.93           74.52      403.5          0.09261           0.10210         0.11120             0.041050         0.1388  ...            82.28       474.2            0.1298            0.25170          0.36300               0.09653          0.2112                  0.08732       1\n454        10.75         14.97           68.26      355.3          0.07793           0.05139         0.02251             0.007875         0.1399  ...            77.79       441.2            0.1076            0.12230          0.09755               0.03413          0.2300                  0.06769       1\n455        25.22         24.91          171.50     1878.0          0.10630           0.26650         0.33390             0.184500         0.1829  ...           
211.70      2562.0            0.1573            0.60760          0.64760               0.28670          0.2355                  0.10510       0\n\n[456 rows x 31 columns]\n\n\n>>> atom.balance(strategy=\"smote\", verbose=2)\n\nOversampling with SMOTE...\n --> Adding 116 samples to class 0.\n\n\n>>> # Note that the number of rows has increased\n>>> print(atom.train)\n\n     mean radius  mean texture  mean perimeter    mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter   worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0      13.480000     20.820000       88.400000   559.200000         0.101600          0.125500        0.106300             0.054390       0.172000  ...       107.300000   740.400000          0.161000           0.422500         0.503000              0.225800        0.280700                 0.107100       0\n1      18.310000     20.580000      120.800000  1052.000000         0.106800          0.124800        0.156900             0.094510       0.186000  ...       142.200000  1493.000000          0.149200           0.253600         0.375900              0.151000        0.307400                 0.078630       0\n2      17.930000     24.480000      115.200000   998.900000         0.088550          0.070270        0.056990             0.047440       0.153800  ...       135.100000  1320.000000          0.131500           0.180600         0.208000              0.113600        0.250400                 0.079480       0\n3      15.130000     29.810000       96.710000   719.500000         0.083200          0.046050        0.046860             0.027390       0.185200  ...       
110.100000   931.400000          0.114800           0.098660         0.154700              0.065750        0.323300                 0.061650       0\n4       8.950000     15.760000       58.740000   245.200000         0.094620          0.124300        0.092630             0.023080       0.130500  ...        63.340000   270.000000          0.117900           0.187900         0.154400              0.038460        0.165200                 0.077220       1\n..           ...           ...             ...          ...              ...               ...             ...                  ...            ...  ...              ...          ...               ...                ...              ...                   ...             ...                      ...     ...\n567    15.182945     22.486774       98.949465   711.386079         0.092513          0.102732        0.113923             0.069481       0.179224  ...       107.689157   826.276172          0.126730           0.199259         0.295172              0.142325        0.265352                 0.068318       0\n568    19.990378     20.622944      130.491182  1253.735467         0.091583          0.117753        0.117236             0.082771       0.202428  ...       167.456689  1995.896044          0.132457           0.289652         0.332006              0.182989        0.299088                 0.084150       0\n569    18.158121     18.928220      119.907435  1027.331092         0.113149          0.147089        0.171862             0.103942       0.209306  ...       135.286302  1319.270051          0.127029           0.233493         0.260138              0.133851        0.302406                 0.079535       0\n570    23.733233     26.433751      158.185672  1724.145541         0.098008          0.193789        0.231158             0.139527       0.188817  ...       
207.483796  2844.559632          0.150495           0.463361         0.599077              0.266433        0.290828                 0.091542       0\n571    17.669575     16.375717      115.468589   968.552411         0.093636          0.109983        0.101005             0.075283       0.174505  ...       133.767576  1227.195245          0.118221           0.264624         0.249798              0.135098        0.268044                 0.076533       0\n\n[572 rows x 31 columns]\n
>>> from atom.data_cleaning import Balancer\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n>>> print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst texture  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension\n0          17.99         10.38          122.80     1001.0          0.11840           0.27760         0.30010              0.14710         0.2419  ...          17.33           184.60      2019.0           0.16220            0.66560           0.7119                0.2654          0.4601                  0.11890\n1          20.57         17.77          132.90     1326.0          0.08474           0.07864         0.08690              0.07017         0.1812  ...          23.41           158.80      1956.0           0.12380            0.18660           0.2416                0.1860          0.2750                  0.08902\n2          19.69         21.25          130.00     1203.0          0.10960           0.15990         0.19740              0.12790         0.2069  ...          25.53           152.50      1709.0           0.14440            0.42450           0.4504                0.2430          0.3613                  0.08758\n3          11.42         20.38           77.58      386.1          0.14250           0.28390         0.24140              0.10520         0.2597  ...          26.50            98.87       567.7           0.20980            0.86630           0.6869                0.2575          0.6638                  0.17300\n4          20.29         14.34          135.10     1297.0          0.10030           0.13280         0.19800              0.10430         0.1809  ...          
16.67           152.20      1575.0           0.13740            0.20500           0.4000                0.1625          0.2364                  0.07678\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...            ...              ...         ...               ...                ...              ...                   ...             ...                      ...\n564        21.56         22.39          142.00     1479.0          0.11100           0.11590         0.24390              0.13890         0.1726  ...          26.40           166.10      2027.0           0.14100            0.21130           0.4107                0.2216          0.2060                  0.07115\n565        20.13         28.25          131.20     1261.0          0.09780           0.10340         0.14400              0.09791         0.1752  ...          38.25           155.00      1731.0           0.11660            0.19220           0.3215                0.1628          0.2572                  0.06637\n566        16.60         28.08          108.30      858.1          0.08455           0.10230         0.09251              0.05302         0.1590  ...          34.12           126.70      1124.0           0.11390            0.30940           0.3403                0.1418          0.2218                  0.07820\n567        20.60         29.33          140.10     1265.0          0.11780           0.27700         0.35140              0.15200         0.2397  ...          39.42           184.60      1821.0           0.16500            0.86810           0.9387                0.2650          0.4087                  0.12400\n568         7.76         24.54           47.92      181.0          0.05263           0.04362         0.00000              0.00000         0.1587  ...          
30.37            59.16       268.6           0.08996            0.06444           0.0000                0.0000          0.2871                  0.07039\n\n[569 rows x 30 columns]\n\n\n>>> balancer = Balancer(strategy=\"smote\", verbose=2)\n>>> X, y = balancer.fit_transform(X, y)\n\nOversampling with SMOTE...\n --> Adding 145 samples to class 0.\n\n\n>>> # Note that the number of rows has increased\n>>> print(X)\n\n     mean radius  mean texture  mean perimeter    mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst texture  worst perimeter   worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension\n0      17.990000     10.380000      122.800000  1001.000000         0.118400          0.277600        0.300100             0.147100       0.241900  ...      17.330000       184.600000  2019.000000          0.162200           0.665600         0.711900              0.265400        0.460100                 0.118900\n1      20.570000     17.770000      132.900000  1326.000000         0.084740          0.078640        0.086900             0.070170       0.181200  ...      23.410000       158.800000  1956.000000          0.123800           0.186600         0.241600              0.186000        0.275000                 0.089020\n2      19.690000     21.250000      130.000000  1203.000000         0.109600          0.159900        0.197400             0.127900       0.206900  ...      25.530000       152.500000  1709.000000          0.144400           0.424500         0.450400              0.243000        0.361300                 0.087580\n3      11.420000     20.380000       77.580000   386.100000         0.142500          0.283900        0.241400             0.105200       0.259700  ...      
26.500000        98.870000   567.700000          0.209800           0.866300         0.686900              0.257500        0.663800                 0.173000\n4      20.290000     14.340000      135.100000  1297.000000         0.100300          0.132800        0.198000             0.104300       0.180900  ...      16.670000       152.200000  1575.000000          0.137400           0.205000         0.400000              0.162500        0.236400                 0.076780\n..           ...           ...             ...          ...              ...               ...             ...                  ...            ...  ...            ...              ...          ...               ...                ...              ...                   ...             ...                      ...\n709    19.478557     23.348123      128.995257  1164.950583         0.101810          0.143231        0.194792             0.095794       0.198376  ...      30.482866       143.381227  1362.533650          0.135197           0.267786         0.365230              0.170069        0.273984                 0.076077\n710    18.752895     20.824323      124.472875  1084.317645         0.096491          0.171270        0.177021             0.095356       0.204866  ...      27.544127       160.451305  1623.116663          0.133721           0.506298         0.521417              0.203921        0.348906                 0.098688\n711    17.182368     21.204540      112.271609   925.918840         0.100517          0.110961        0.110803             0.076692       0.204604  ...      28.119577       142.316398  1439.815962          0.155602           0.277795         0.388351              0.207039        0.334574                 0.080310\n712    18.285452     20.578363      120.603613  1048.317740         0.106252          0.125135        0.153635             0.093128       0.188095  ...      
26.188544       142.298194  1487.517523          0.147703           0.251890         0.365958              0.150828        0.308848                 0.078435\n713    14.550791     25.918705       96.913441   655.023273         0.111607          0.166865        0.158127             0.077468       0.228924  ...      36.072516       123.641397   930.709825          0.163673           0.659480         0.662486              0.197880        0.423041                 0.132320\n\n[714 rows x 30 columns]\n
"}, {"location": "API/data_cleaning/balancer/#methods", "title": "Methods", "text": "

fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformBalance the data.

method fit(X, y=-1)[source]Fit to data.

ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

y: int, str, dict or sequence, default=-1 Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe: Target columns for multioutput tasks.

ReturnsSelf Estimator instance.

method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

**fit_params Additional keyword arguments for the fit method.

Returnsdataframe Transformed feature set. Only returned if provided.

series or dataframe Transformed target column. Only returned if provided.

method get_params(deep=True)[source]Get parameters for this estimator.

Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

Returnsparams : dict Parameter names mapped to their values.

method inverse_transform(X=None, y=None)[source]Do nothing.

Returns the input unchanged. Implemented for continuity of the API.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

Returnsdataframe Feature set. Only returned if provided.

series or dataframe Target column. Only returned if provided.

method set_params(**params)[source]Set the parameters of this estimator.

Parameters**params : dict Estimator parameters.

Returnsself : estimator instance Estimator instance.

method transform(X, y=-1)[source]Balance the data.

ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

y: int, str or sequence, default=-1 Target column corresponding to `X`.

  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • Else: Array with shape=(n_samples,) to use as target.

Returnsdataframe Balanced dataframe.

series Transformed target column.

"}, {"location": "API/data_cleaning/cleaner/", "title": "Cleaner", "text": "

class atom.data_cleaning.Cleaner(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None)[source]Applies standard data cleaning steps on a dataset.

Use the parameters to choose which transformations to perform. The available steps are:

  • Convert dtypes to the best possible types.
  • Drop columns with specific data types.
  • Remove characters from column names.
  • Strip spaces from categorical features.
  • Drop duplicate rows.
  • Drop rows with missing values in the target column.
  • Encode the target column.

This class can be accessed from atom through the clean method. Read more in the user guide.

Parametersconvert_dtypes: bool, default=True Convert the columns' data types to the best possible types that support pd.NA.

drop_dtypes: str, sequence or None, default=None Columns with these data types are dropped from the dataset.

drop_chars: str or None, default=None Remove the specified regex pattern from column names, e.g. [^A-Za-z0-9]+ to remove all non-alphanumerical characters.

strip_categorical: bool, default=True Whether to strip spaces from categorical columns.

drop_duplicates: bool, default=False Whether to drop duplicate rows. Only the first occurrence of every duplicated row is kept.

drop_missing_target: bool, default=True Whether to drop rows with missing values in the target column. This transformation is ignored if y is not provided.

encode_target: bool, default=True Whether to encode the target column(s). This includes converting categorical columns to numerical, and binarizing multilabel columns. This transformation is ignored if y is not provided.

device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

  • \"data\":

    • \"numpy\"
    • \"pyarrow\"
    • \"modin\"
  • \"estimator\":

    • \"sklearn\"
    • \"cuml\"

verbose: int, default=0 Verbosity level of the class. Choose from:

  • 0 to not print anything.
  • 1 to print basic information.
  • 2 to print detailed information.

logger: str, Logger or None, default=None

  • If None: Logging isn't used.
  • If str: Name of the log file. Use \"auto\" for automatic naming.
  • Else: Python logging.Logger instance.

Attributesmissing_: list Values that are considered \"missing\". Default values are: None, NaN, NA, NaT, +inf, -inf, \"\", \"?\", \"NA\", \"nan\", \"NaN\", \"NaT\", \"none\", \"None\", \"inf\", \"-inf\". Note that None, NaN, NA, +inf and -inf are always considered missing since they are incompatible with sklearn estimators.

mapping_: dict Target values mapped to their respective encoded integers. Only available if encode_target=True.

feature_names_in_: np.ndarray Names of features seen during fit.

target_names_in_: np.ndarray Names of the target column(s) seen during fit.

n_features_in_: int Number of features seen during fit.

See Also

Encoder Perform encoding of categorical features.

Discretizer Bin continuous data into intervals.

Scaler Scale the data.

"}, {"location": "API/data_cleaning/cleaner/#example", "title": "Example", "text": "atomstand-alone
>>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n>>> y = [\"a\" if i else \"b\" for i in y]\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> print(atom.y)\n\n0      a\n1      a\n2      a\n3      a\n4      a\n      ..\n564    a\n565    a\n566    a\n567    a\n568    b\nName: target, Length: 569, dtype: object\n\n\n>>> atom.clean(verbose=2)\n\nFitting Cleaner...\nCleaning the data...\n --> Label-encoding column target.\n\n\n>>> print(atom.y)\n\n0      0\n1      0\n2      0\n3      0\n4      0\n      ..\n564    0\n565    0\n566    0\n567    0\n568    1\nName: target, Length: 569, dtype: Int64\n
>>> from atom.data_cleaning import Cleaner\n>>> from numpy.random import randint\n\n>>> y = [\"a\" if i else \"b\" for i in range(randint(100))]\n\n>>> cleaner = Cleaner(verbose=2)\n>>> y = cleaner.fit_transform(y=y)\n\nFitting Cleaner...\nCleaning the data...\n --> Label-encoding column target.\n\n\n>>> print(y)\n\n0     1\n1     0\n2     0\n3     0\n4     0\n5     0\n6     0\n7     0\n8     0\n9     0\n10    0\n11    0\n12    0\n13    0\n14    0\n15    0\n16    0\n17    0\n18    0\n19    0\n20    0\n21    0\n22    0\n23    0\n24    0\n25    0\n26    0\n27    0\n28    0\n29    0\n30    0\n31    0\n32    0\n33    0\n34    0\n35    0\n36    0\nName: target, dtype: Int64\n
"}, {"location": "API/data_cleaning/cleaner/#methods", "title": "Methods", "text": "

fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformInversely transform the label encoding.set_paramsSet the parameters of this estimator.transformApply the data cleaning steps to the data.

method fit(X=None, y=None)[source]Fit to data.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe: Target columns for multioutput tasks.

ReturnsSelf Estimator instance.

method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

**fit_params Additional keyword arguments for the fit method.

Returnsdataframe Transformed feature set. Only returned if provided.

series or dataframe Transformed target column. Only returned if provided.

method get_params(deep=True)[source]Get parameters for this estimator.

Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

Returnsparams : dict Parameter names mapped to their values.

method inverse_transform(X=None, y=None)[source]Inversely transform the label encoding.

This method only inversely transforms the target encoding. The rest of the transformations can't be inverted. If encode_target=False, the data is returned as is.

ParametersX: dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe: Target columns for multioutput tasks.

Returnsdataframe Unchanged feature set. Only returned if provided.

series or dataframe Original target column. Only returned if provided.

method set_params(**params)[source]Set the parameters of this estimator.

Parameters**params : dict Estimator parameters.

Returnsself : estimator instance Estimator instance.

method transform(X=None, y=None)[source]Apply the data cleaning steps to the data.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe: Target columns for multioutput tasks.

Returnsdataframe Transformed feature set. Only returned if provided.

series Transformed target column. Only returned if provided.

"}, {"location": "API/data_cleaning/discretizer/", "title": "Discretizer", "text": "

class atom.data_cleaning.Discretizer(strategy=\"quantile\", bins=5, labels=None, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, random_state=None)[source]Bin continuous data into intervals.

For each feature, the bin edges are computed during fit and, together with the number of bins, they define the intervals. Ignores categorical columns.

This class can be accessed from atom through the discretize method. Read more in the user guide.

Tip

The transformation returns categorical columns. Use the Encoder class to convert them back to numerical types.

Parametersstrategy: str, default=\"quantile\" Strategy used to define the widths of the bins. Choose from:

  • \"uniform\": All bins have identical widths.
  • \"quantile\": All bins have the same number of points.
  • \"kmeans\": Values in each bin have the same nearest center of a 1D k-means cluster.
  • \"custom\": Use custom bin edges provided through bins.

bins: int, sequence or dict, default=5 Bin number or bin edges in which to split every column.

  • If int: Number of bins to produce for all columns. Only for strategy!=\"custom\".
  • If sequence:

    • For strategy!=\"custom\": Number of bins per column. The n-th value corresponds to the n-th column that is transformed. Categorical columns are ignored.
    • For strategy=\"custom\": Bin edges with length=n_bins - 1. The outermost edges are always -inf and +inf, e.g., bins [1, 2] indicate (-inf, 1], (1, 2], (2, inf].
  • If dict: One of the aforementioned options per column, where the key is the column's name. Columns that are not in the dictionary are not transformed.

labels: sequence, dict or None, default=None Label names with which to replace the binned intervals.

  • If None: Use default labels of the form (min_edge, max_edge].
  • If sequence: Labels to use for all columns.
  • If dict: Labels per column, where the key is the column's name. Columns that are not in the dictionary use the default labels.

device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

  • \"data\":

    • \"numpy\"
    • \"pyarrow\"
    • \"modin\"
  • \"estimator\":

    • \"sklearn\"
    • \"cuml\"

verbose: int, default=0 Verbosity level of the class. Choose from:

  • 0 to not print anything.
  • 1 to print basic information.
  • 2 to print detailed information.

logger: str, Logger or None, default=None

  • If None: Logging isn't used.
  • If str: Name of the log file. Use \"auto\" for automatic naming.
  • Else: Python logging.Logger instance.

random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random. Only for strategy=\"quantile\".

Attributesfeature_names_in_: np.ndarray Names of features seen during fit.

n_features_in_: int Number of features seen during fit.

See Also

Encoder Perform encoding of categorical features.

Imputer Handle missing values in the data.

Normalizer Transform the data to follow a Normal/Gaussian distribution.

"}, {"location": "API/data_cleaning/discretizer/#example", "title": "Example", "text": "atomstand-alone
>>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> print(atom[\"mean radius\"])\n\n0      13.48\n1      18.31\n2      17.93\n3      15.13\n4       8.95\n       ...  \n564    14.34\n565    13.17\n566    17.30\n567    17.68\n568    14.80\nName: mean radius, Length: 569, dtype: float64\n\n\n>>> atom.discretize(\n...     strategy=\"custom\",\n...     bins=[13, 18],\n...     labels=[\"small\", \"medium\", \"large\"],\n...     verbose=2,\n...     columns=\"mean radius\",\n... )\n\nFitting Discretizer...\nBinning the features...\n --> Discretizing feature mean radius in 3 bins.\n\n\n>>> print(atom[\"mean radius\"])\n\n0      medium\n1       large\n2      medium\n3      medium\n4       small\n        ...  \n564    medium\n565    medium\n566    medium\n567    medium\n568    medium\nName: mean radius, Length: 569, dtype: category\nCategories (3, object): ['small' < 'medium' < 'large']\n
>>> from atom.data_cleaning import Discretizer\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n>>> print(X[\"mean radius\"])\n\n0      17.99\n1      20.57\n2      19.69\n3      11.42\n4      20.29\n       ...  \n564    21.56\n565    20.13\n566    16.60\n567    20.60\n568     7.76\nName: mean radius, Length: 569, dtype: float64\n\n\n>>> discretizer = Discretizer(\n...     strategy=\"custom\",\n...     bins={\"mean radius\": [13, 18]},\n...     labels=[\"small\", \"medium\", \"large\"],\n...     verbose=2,\n... )\n>>> X = discretizer.fit_transform(X)\n\nFitting Discretizer...\nBinning the features...\n --> Discretizing feature mean radius in 3 bins.\n\n\n>>> print(X[\"mean radius\"])\n\n0      medium\n1       large\n2       large\n3       small\n4       large\n        ...  \n564     large\n565     large\n566    medium\n567     large\n568     small\nName: mean radius, Length: 569, dtype: category\nCategories (3, object): ['small' < 'medium' < 'large']\n
"}, {"location": "API/data_cleaning/discretizer/#methods", "title": "Methods", "text": "

fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformBin the data into intervals.

method fit(X, y=None)[source]Fit to data.

ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

ReturnsSelf Estimator instance.

method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

**fit_params Additional keyword arguments for the fit method.

Returnsdataframe Transformed feature set. Only returned if provided.

series or dataframe Transformed target column. Only returned if provided.

method get_params(deep=True)[source]Get parameters for this estimator.

Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

Returnsparams : dict Parameter names mapped to their values.

method inverse_transform(X=None, y=None)[source]Do nothing.

Returns the input unchanged. Implemented for continuity of the API.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

Returnsdataframe Feature set. Only returned if provided.

series or dataframe Target column. Only returned if provided.

method set_params(**params)[source]Set the parameters of this estimator.

Parameters**params : dict Estimator parameters.

Returnsself : estimator instance Estimator instance.

method transform(X, y=None)[source]Bin the data into intervals.

ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

Returnsdataframe Transformed feature set.

"}, {"location": "API/data_cleaning/encoder/", "title": "Encoder", "text": "

class atom.data_cleaning.Encoder(strategy=\"Target\", max_onehot=10, ordinal=None, infrequent_to_value=None, value=\"infrequent\", n_jobs=1, verbose=0, logger=None, **kwargs)[source]Perform encoding of categorical features.

The encoding type depends on the number of classes in the column:

  • If n_classes=2 or ordinal feature, use Ordinal-encoding.
  • If 2 < n_classes <= max_onehot, use OneHot-encoding.
  • If n_classes > max_onehot, use strategy-encoding.

Missing values are propagated to the output column. Unknown classes encountered during transforming are imputed according to the selected strategy. Infrequent classes can be replaced with a value in order to prevent too high cardinality.

This class can be accessed from atom through the encode method. Read more in the user guide.

Warning

Three category-encoders estimators are unavailable:

  • OneHotEncoder: Use the max_onehot parameter.
  • HashingEncoder: Incompatibility of APIs.
  • LeaveOneOutEncoder: Incompatibility of APIs.

Parametersstrategy: str or estimator, default=\"Target\" Type of encoding to use for high cardinality features. Choose from any of the estimators in the category-encoders package or provide a custom one.

max_onehot: int or None, default=10 Maximum number of unique values in a feature to perform one-hot encoding. If None, strategy-encoding is always used for columns with more than two classes.

ordinal: dict or None, default=None Order of ordinal features, where the dict key is the feature's name and the value is the class order, e.g., {\"salary\": [\"low\", \"medium\", \"high\"]}.

infrequent_to_value: int, float or None, default=None Replaces infrequent class occurrences in categorical columns with the string in parameter value. This transformation is done before the encoding of the column.

  • If None: Skip this step.
  • If int: Minimum number of occurrences in a class.
  • If float: Minimum fraction of occurrences in a class.

value: str, default=\"infrequent\" Value with which to replace rare classes. This parameter is ignored if infrequent_to_value=None.

n_jobs: int, default=1 Number of cores to use for parallel processing.

  • If >0: Number of cores to use.
  • If -1: Use all available cores.
  • If <-1: Use number of cores + 1 + value, e.g., -2 uses all cores but one.

verbose: int, default=0 Verbosity level of the class. Choose from:

  • 0 to not print anything.
  • 1 to print basic information.
  • 2 to print detailed information.

logger: str, Logger or None, default=None

  • If None: Logging isn't used.
  • If str: Name of the log file. Use \"auto\" for automatic naming.
  • Else: Python logging.Logger instance.

**kwargs Additional keyword arguments for the strategy estimator.

Attributesmapping_: dict of dicts Encoded values and their respective mapping. The column name is the key to its mapping dictionary. Only for ordinal encoding.

feature_names_in_: np.ndarray Names of features seen during fit.

n_features_in_: int Number of features seen during fit.

See Also

Cleaner Applies standard data cleaning steps on a dataset.

Imputer Handle missing values in the data.

Pruner Prune outliers from the data.

"}, {"location": "API/data_cleaning/encoder/#example", "title": "Example", "text": "atomstand-alone
>>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n>>> from numpy.random import randint\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n>>> X[\"cat_feature_1\"] = [f\"x{i}\" for i in randint(0, 2, len(X))]\n>>> X[\"cat_feature_2\"] = [f\"x{i}\" for i in randint(0, 3, len(X))]\n>>> X[\"cat_feature_3\"] = [f\"x{i}\" for i in randint(0, 20, len(X))]\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> print(atom.X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  cat_feature_1  cat_feature_2  cat_feature_3\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630              0.05439         0.1720  ...            0.1610            0.42250           0.5030               0.22580          0.2807                  0.10710             x0             x1            x17\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690              0.09451         0.1860  ...            0.1492            0.25360           0.3759               0.15100          0.3074                  0.07863             x0             x0            x15\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699              0.04744         0.1538  ...            0.1315            0.18060           0.2080               0.11360          0.2504                  0.07948             x1             x0            x16\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686              0.02739         0.1852  ...            
0.1148            0.09866           0.1547               0.06575          0.3233                  0.06165             x0             x0            x13\n4           8.95         15.76           58.74      245.2          0.09462           0.12430         0.09263              0.02308         0.1305  ...            0.1179            0.18790           0.1544               0.03846          0.1652                  0.07722             x0             x1            x11\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...               ...                ...              ...                   ...             ...                      ...            ...            ...            ...\n564        14.34         13.47           92.51      641.2          0.09906           0.07624         0.05724              0.04603         0.2075  ...            0.1297            0.15250           0.1632               0.10870          0.3062                  0.06072             x0             x2            x11\n565        13.17         21.81           85.42      531.5          0.09714           0.10470         0.08259              0.05252         0.1746  ...            0.1503            0.39040           0.3728               0.16070          0.3693                  0.09618             x1             x1             x5\n566        17.30         17.08          113.00      928.2          0.10080           0.10410         0.12660              0.08353         0.1813  ...            0.1416            0.24050           0.3378               0.18570          0.3138                  0.08113             x0             x1            x17\n567        17.68         20.74          117.40      963.7          0.11150           0.16650         0.18550              0.10540         0.1971  ...            
0.1418            0.34980           0.3583               0.15150          0.2463                  0.07738             x0             x0             x2\n568        14.80         17.66           95.88      674.8          0.09179           0.08890         0.04069              0.02260         0.1893  ...            0.1226            0.18810           0.2060               0.08308          0.3600                  0.07285             x0             x2            x14\n\n[569 rows x 33 columns]\n\n\n>>> atom.encode(strategy=\"target\", max_onehot=10, verbose=2)\n\nFitting Encoder...\nEncoding categorical columns...\n --> Ordinal-encoding feature cat_feature_1. Contains 2 classes.\n --> OneHot-encoding feature cat_feature_2. Contains 3 classes.\n --> Target-encoding feature cat_feature_3. Contains 20 classes.\n\n\n>>> # Note the one-hot encoded column with name [feature]_[class]\n>>> print(atom.X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst concavity  worst concave points  worst symmetry  worst fractal dimension  cat_feature_1  cat_feature_2_x1  cat_feature_2_x0  cat_feature_2_x2  cat_feature_3\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630              0.05439         0.1720  ...           0.5030               0.22580          0.2807                  0.10710            0.0               1.0               0.0               0.0       0.622917\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690              0.09451         0.1860  ...           0.3759               0.15100          0.3074                  0.07863            0.0               0.0               1.0               0.0       0.619953\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699              0.04744         0.1538  ...  
         0.2080               0.11360          0.2504                  0.07948            1.0               0.0               1.0               0.0       0.636924\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686              0.02739         0.1852  ...           0.1547               0.06575          0.3233                  0.06165            0.0               0.0               1.0               0.0       0.585368\n4           8.95         15.76           58.74      245.2          0.09462           0.12430         0.09263              0.02308         0.1305  ...           0.1544               0.03846          0.1652                  0.07722            0.0               1.0               0.0               0.0       0.638596\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...                   ...             ...                      ...            ...               ...               ...               ...            ...\n564        14.34         13.47           92.51      641.2          0.09906           0.07624         0.05724              0.04603         0.2075  ...           0.1632               0.10870          0.3062                  0.06072            0.0               0.0               0.0               1.0       0.638596\n565        13.17         21.81           85.42      531.5          0.09714           0.10470         0.08259              0.05252         0.1746  ...           0.3728               0.16070          0.3693                  0.09618            1.0               1.0               0.0               0.0       0.588596\n566        17.30         17.08          113.00      928.2          0.10080           0.10410         0.12660              0.08353         0.1813  ...           
0.3378               0.18570          0.3138                  0.08113            0.0               1.0               0.0               0.0       0.622917\n567        17.68         20.74          117.40      963.7          0.11150           0.16650         0.18550              0.10540         0.1971  ...           0.3583               0.15150          0.2463                  0.07738            0.0               0.0               1.0               0.0       0.688596\n568        14.80         17.66           95.88      674.8          0.09179           0.08890         0.04069              0.02260         0.1893  ...           0.2060               0.08308          0.3600                  0.07285            0.0               0.0               0.0               1.0       0.662643\n\n[569 rows x 35 columns]\n
>>> from atom.data_cleaning import Encoder\n>>> from sklearn.datasets import load_breast_cancer\n>>> from numpy.random import randint\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n>>> X[\"cat_feature_1\"] = [f\"x{i}\" for i in randint(0, 2, len(X))]\n>>> X[\"cat_feature_2\"] = [f\"x{i}\" for i in randint(0, 3, len(X))]\n>>> X[\"cat_feature_3\"] = [f\"x{i}\" for i in randint(0, 20, len(X))]\n>>> print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  cat_feature_1  cat_feature_2  cat_feature_3\n0          17.99         10.38          122.80     1001.0          0.11840           0.27760         0.30010              0.14710         0.2419  ...           0.16220            0.66560           0.7119                0.2654          0.4601                  0.11890             x1             x2             x5\n1          20.57         17.77          132.90     1326.0          0.08474           0.07864         0.08690              0.07017         0.1812  ...           0.12380            0.18660           0.2416                0.1860          0.2750                  0.08902             x1             x2            x13\n2          19.69         21.25          130.00     1203.0          0.10960           0.15990         0.19740              0.12790         0.2069  ...           0.14440            0.42450           0.4504                0.2430          0.3613                  0.08758             x0             x0            x15\n3          11.42         20.38           77.58      386.1          0.14250           0.28390         0.24140              0.10520         0.2597  ...           
0.20980            0.86630           0.6869                0.2575          0.6638                  0.17300             x0             x2            x10\n4          20.29         14.34          135.10     1297.0          0.10030           0.13280         0.19800              0.10430         0.1809  ...           0.13740            0.20500           0.4000                0.1625          0.2364                  0.07678             x1             x1            x17\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...               ...                ...              ...                   ...             ...                      ...            ...            ...            ...\n564        21.56         22.39          142.00     1479.0          0.11100           0.11590         0.24390              0.13890         0.1726  ...           0.14100            0.21130           0.4107                0.2216          0.2060                  0.07115             x1             x1            x12\n565        20.13         28.25          131.20     1261.0          0.09780           0.10340         0.14400              0.09791         0.1752  ...           0.11660            0.19220           0.3215                0.1628          0.2572                  0.06637             x0             x2            x14\n566        16.60         28.08          108.30      858.1          0.08455           0.10230         0.09251              0.05302         0.1590  ...           0.11390            0.30940           0.3403                0.1418          0.2218                  0.07820             x0             x1             x3\n567        20.60         29.33          140.10     1265.0          0.11780           0.27700         0.35140              0.15200         0.2397  ...           
0.16500            0.86810           0.9387                0.2650          0.4087                  0.12400             x1             x0             x2\n568         7.76         24.54           47.92      181.0          0.05263           0.04362         0.00000              0.00000         0.1587  ...           0.08996            0.06444           0.0000                0.0000          0.2871                  0.07039             x1             x1            x11\n\n[569 rows x 33 columns]\n\n\n>>> encoder = Encoder(strategy=\"target\", max_onehot=10, verbose=2)\n>>> X = encoder.fit_transform(X, y)\n\nFitting Encoder...\nEncoding categorical columns...\n --> Ordinal-encoding feature cat_feature_1. Contains 2 classes.\n --> OneHot-encoding feature cat_feature_2. Contains 3 classes.\n --> Target-encoding feature cat_feature_3. Contains 20 classes.\n\n\n>>> # Note the one-hot encoded column with name [feature]_[class]\n>>> print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst concavity  worst concave points  worst symmetry  worst fractal dimension  cat_feature_1  cat_feature_2_x2  cat_feature_2_x0  cat_feature_2_x1  cat_feature_3\n0          17.99         10.38          122.80     1001.0          0.11840           0.27760         0.30010              0.14710         0.2419  ...           0.7119                0.2654          0.4601                  0.11890            1.0               1.0               0.0               0.0       0.645086\n1          20.57         17.77          132.90     1326.0          0.08474           0.07864         0.08690              0.07017         0.1812  ...           
0.2416                0.1860          0.2750                  0.08902            1.0               1.0               0.0               0.0       0.604148\n2          19.69         21.25          130.00     1203.0          0.10960           0.15990         0.19740              0.12790         0.2069  ...           0.4504                0.2430          0.3613                  0.08758            0.0               0.0               1.0               0.0       0.675079\n3          11.42         20.38           77.58      386.1          0.14250           0.28390         0.24140              0.10520         0.2597  ...           0.6869                0.2575          0.6638                  0.17300            0.0               1.0               0.0               0.0       0.706297\n4          20.29         14.34          135.10     1297.0          0.10030           0.13280         0.19800              0.10430         0.1809  ...           0.4000                0.1625          0.2364                  0.07678            1.0               0.0               0.0               1.0       0.716566\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...                   ...             ...                      ...            ...               ...               ...               ...            ...\n564        21.56         22.39          142.00     1479.0          0.11100           0.11590         0.24390              0.13890         0.1726  ...           0.4107                0.2216          0.2060                  0.07115            1.0               0.0               0.0               1.0       0.598024\n565        20.13         28.25          131.20     1261.0          0.09780           0.10340         0.14400              0.09791         0.1752  ...           
0.3215                0.1628          0.2572                  0.06637            0.0               1.0               0.0               0.0       0.683185\n566        16.60         28.08          108.30      858.1          0.08455           0.10230         0.09251              0.05302         0.1590  ...           0.3403                0.1418          0.2218                  0.07820            0.0               0.0               0.0               1.0       0.472908\n567        20.60         29.33          140.10     1265.0          0.11780           0.27700         0.35140              0.15200         0.2397  ...           0.9387                0.2650          0.4087                  0.12400            1.0               0.0               1.0               0.0       0.585452\n568         7.76         24.54           47.92      181.0          0.05263           0.04362         0.00000              0.00000         0.1587  ...           0.0000                0.0000          0.2871                  0.07039            1.0               0.0               0.0               1.0       0.516759\n\n[569 rows x 35 columns]\n
"}, {"location": "API/data_cleaning/encoder/#methods", "title": "Methods", "text": "

fit: Fit to data. fit_transform: Fit to data, then transform it. get_params: Get parameters for this estimator. inverse_transform: Do nothing. set_params: Set the parameters of this estimator. transform: Encode the data.

method fit(X, y=None)[source]Fit to data.

Note that leaving y=None can lead to errors if the strategy encoder requires target values. For multioutput tasks, only the first target column is used to fit the encoder.

ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe: Target columns for multioutput tasks.

ReturnsSelf Estimator instance.

method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

**fit_params Additional keyword arguments for the fit method.

Returnsdataframe Transformed feature set. Only returned if provided.

series or dataframe Transformed target column. Only returned if provided.

method get_params(deep=True)[source]Get parameters for this estimator.

Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

Returnsparams : dict Parameter names mapped to their values.

method inverse_transform(X=None, y=None)[source]Do nothing.

Returns the input unchanged. Implemented for continuity of the API.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

Returnsdataframe Feature set. Only returned if provided.

series or dataframe Target column. Only returned if provided.

method set_params(**params)[source]Set the parameters of this estimator.

Parameters**params : dict Estimator parameters.

Returnsself : estimator instance Estimator instance.

method transform(X, y=None)[source]Encode the data.

ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

Returnsdataframe Encoded dataframe.

"}, {"location": "API/data_cleaning/imputer/", "title": "Imputer", "text": "

class atom.data_cleaning.Imputer(strat_num=\"drop\", strat_cat=\"drop\", max_nan_rows=None, max_nan_cols=None, n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, random_state=None)[source]Handle missing values in the data.

Impute or remove missing values according to the selected strategy. Also removes rows and columns with too many missing values. Use the missing_ attribute to customize which values are considered \"missing values\".

This class can be accessed from atom through the impute method. Read more in the user guide.

Parametersstrat_num: str, int or float, default=\"drop\" Imputing strategy for numerical columns. Choose from:

  • \"drop\": Drop rows containing missing values.
  • \"mean\": Impute with mean of column.
  • \"median\": Impute with median of column.
  • \"knn\": Impute using a K-Nearest Neighbors approach.
  • \"iterative\": Impute using a multivariate imputer.
  • \"most_frequent\": Impute with the most frequent value.
  • int or float: Impute with provided numerical value.

strat_cat: str, default=\"drop\" Imputing strategy for categorical columns. Choose from:

  • \"drop\": Drop rows containing missing values.
  • \"most_frequent\": Impute with the most frequent value.
  • str: Impute with provided string.

max_nan_rows: int, float or None, default=None Maximum number or fraction of missing values in a row (if more, the row is removed). If None, ignore this step.

max_nan_cols: int, float or None, default=None Maximum number or fraction of missing values in a column (if more, the column is removed). If None, ignore this step.

n_jobs: int, default=1 Number of cores to use for parallel processing.

  • If >0: Number of cores to use.
  • If -1: Use all available cores.
  • If <-1: Use number of cores + 1 + value.

device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

  • \"data\":

    • \"numpy\"
    • \"pyarrow\"
    • \"modin\"
  • \"estimator\":

    • \"sklearn\"
    • \"cuml\"

verbose: int, default=0 Verbosity level of the class. Choose from:

  • 0 to not print anything.
  • 1 to print basic information.
  • 2 to print detailed information.

logger: str, Logger or None, default=None

  • If None: Logging isn't used.
  • If str: Name of the log file. Use \"auto\" for automatic naming.
  • Else: Python logging.Logger instance.

random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random. Only used when strat_num=\"iterative\".

Attributesmissing_: list Values that are considered \"missing\". Default values are: None, NaN, NA, NaT, +inf, -inf, \"\", \"?\", \"NA\", \"nan\", \"NaN\", \"NaT\", \"none\", \"None\", \"inf\", \"-inf\". Note that None, NaN, NA, +inf and -inf are always considered missing since they are incompatible with sklearn estimators.

feature_names_in_: np.ndarray Names of features seen during fit.

n_features_in_: int Number of features seen during fit.

See Also

Balancer Balance the number of samples per class in the target column.

Discretizer Bin continuous data into intervals.

Encoder Perform encoding of categorical features.

"}, {"location": "API/data_cleaning/imputer/#example", "title": "Example", "text": "atomstand-alone
>>> import numpy as np\n>>> from atom import ATOMClassifier\n>>> from numpy.random import randint\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> # Add some random missing values to the data\n>>> for i, j in zip(randint(0, X.shape[0], 600), randint(0, 4, 600)):\n...     X.iat[i, j] = np.nan\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> print(atom.nans)\n\nmean radius                130\nmean texture               141\nmean perimeter             124\nmean area                  136\nmean smoothness              0\nmean compactness             0\nmean concavity               0\nmean concave points          0\nmean symmetry                0\nmean fractal dimension       0\nradius error                 0\ntexture error                0\nperimeter error              0\narea error                   0\nsmoothness error             0\ncompactness error            0\nconcavity error              0\nconcave points error         0\nsymmetry error               0\nfractal dimension error      0\nworst radius                 0\nworst texture                0\nworst perimeter              0\nworst area                   0\nworst smoothness             0\nworst compactness            0\nworst concavity              0\nworst concave points         0\nworst symmetry               0\nworst fractal dimension      0\ndtype: int64\n\n\n>>> atom.impute(strat_num=\"median\", max_nan_rows=0.1, verbose=2)\n\nFitting Imputer...\nImputing missing values...\n --> Imputing 130 missing values with median (13.27) in feature mean radius.\n --> Imputing 141 missing values with median (18.87) in feature mean texture.\n --> Imputing 124 missing values with median (85.66) in feature mean perimeter.\n --> Imputing 136 missing values with median (555.1) in feature mean area.\n\n\n>>> print(atom.n_nans)\n\n0\n
>>> import numpy as np\n>>> from atom.data_cleaning import Imputer\n>>> from numpy.random import randint\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> # Add some random missing values to the data\n>>> for i, j in zip(randint(0, X.shape[0], 600), randint(0, 4, 600)):\n...     X.iloc[i, j] = np.nan\n\n>>> imputer = Imputer(strat_num=\"median\", max_nan_rows=0.1, verbose=2)\n>>> X, y = imputer.fit_transform(X, y)\n\nFitting Imputer...\nImputing missing values...\n --> Dropping 2 samples for containing more than 3 missing values.\n --> Imputing 124 missing values with median (13.38) in feature mean radius.\n --> Imputing 127 missing values with median (18.87) in feature mean texture.\n --> Imputing 137 missing values with median (86.54) in feature mean perimeter.\n --> Imputing 134 missing values with median (561.3) in feature mean area.\n\n\n>>> print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst texture  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension\n0          13.38        10.380         122.800     1001.0          0.11840           0.27760         0.30010              0.14710         0.2419  ...          17.33           184.60      2019.0           0.16220            0.66560           0.7119                0.2654          0.4601                  0.11890\n1          20.57        17.770          86.545      561.3          0.08474           0.07864         0.08690              0.07017         0.1812  ...          
23.41           158.80      1956.0           0.12380            0.18660           0.2416                0.1860          0.2750                  0.08902\n2          19.69        21.250         130.000     1203.0          0.10960           0.15990         0.19740              0.12790         0.2069  ...          25.53           152.50      1709.0           0.14440            0.42450           0.4504                0.2430          0.3613                  0.08758\n3          11.42        20.380          77.580      386.1          0.14250           0.28390         0.24140              0.10520         0.2597  ...          26.50            98.87       567.7           0.20980            0.86630           0.6869                0.2575          0.6638                  0.17300\n4          13.38        14.340         135.100     1297.0          0.10030           0.13280         0.19800              0.10430         0.1809  ...          16.67           152.20      1575.0           0.13740            0.20500           0.4000                0.1625          0.2364                  0.07678\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...            ...              ...         ...               ...                ...              ...                   ...             ...                      ...\n564        21.56        22.390          86.545      561.3          0.11100           0.11590         0.24390              0.13890         0.1726  ...          26.40           166.10      2027.0           0.14100            0.21130           0.4107                0.2216          0.2060                  0.07115\n565        20.13        18.865         131.200     1261.0          0.09780           0.10340         0.14400              0.09791         0.1752  ...          
38.25           155.00      1731.0           0.11660            0.19220           0.3215                0.1628          0.2572                  0.06637\n566        13.38        28.080          86.545      561.3          0.08455           0.10230         0.09251              0.05302         0.1590  ...          34.12           126.70      1124.0           0.11390            0.30940           0.3403                0.1418          0.2218                  0.07820\n567        20.60        29.330         140.100     1265.0          0.11780           0.27700         0.35140              0.15200         0.2397  ...          39.42           184.60      1821.0           0.16500            0.86810           0.9387                0.2650          0.4087                  0.12400\n568        13.38        24.540          47.920      181.0          0.05263           0.04362         0.00000              0.00000         0.1587  ...          30.37            59.16       268.6           0.08996            0.06444           0.0000                0.0000          0.2871                  0.07039\n\n[567 rows x 30 columns]\n
"}, {"location": "API/data_cleaning/imputer/#methods", "title": "Methods", "text": "

fit: Fit to data. fit_transform: Fit to data, then transform it. get_params: Get parameters for this estimator. inverse_transform: Do nothing. set_params: Set the parameters of this estimator. transform: Impute the missing values.

method fit(X, y=None)[source]Fit to data.

ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

ReturnsSelf Estimator instance.

method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

**fit_params Additional keyword arguments for the fit method.

Returnsdataframe Transformed feature set. Only returned if provided.

series or dataframe Transformed target column. Only returned if provided.

method get_params(deep=True)[source]Get parameters for this estimator.

Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

Returnsparams : dict Parameter names mapped to their values.

method inverse_transform(X=None, y=None)[source]Do nothing.

Returns the input unchanged. Implemented for continuity of the API.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

Returnsdataframe Feature set. Only returned if provided.

series or dataframe Target column. Only returned if provided.

method set_params(**params)[source]Set the parameters of this estimator.

Parameters**params : dict Estimator parameters.

Returnsself : estimator instance Estimator instance.

method transform(X, y=None)[source]Impute the missing values.

Note that leaving y=None can lead to inconsistencies in data length between X and y if rows are dropped during the transformation.

ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe: Target columns for multioutput tasks.

Returnsdataframe Imputed dataframe.

series Transformed target column. Only returned if provided.

"}, {"location": "API/data_cleaning/normalizer/", "title": "Normalizer", "text": "

class atom.data_cleaning.Normalizer(strategy=\"yeojohnson\", device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, random_state=None, **kwargs)[source]Transform the data to follow a Normal/Gaussian distribution.

This transformation is useful for modeling issues related to heteroscedasticity (non-constant variance), or other situations where normality is desired. Missing values are disregarded in fit and maintained in transform. Categorical columns are ignored.

This class can be accessed from atom through the normalize method. Read more in the user guide.

Warning

The quantile strategy performs a non-linear transformation. This may distort linear correlations between variables measured at the same scale but renders variables measured at different scales more directly comparable.

Note

The yeojohnson and boxcox strategies scale the data after transforming. Use the kwargs to change this behavior.

Parametersstrategy: str, default=\"yeojohnson\" The transforming strategy. Choose from:

  • \"yeojohnson\"
  • \"boxcox\" (only works with strictly positive values)
  • \"quantile\": Transform features using quantiles information.

device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

  • \"data\":

    • \"numpy\"
    • \"pyarrow\"
    • \"modin\"
  • \"estimator\":

    • \"sklearn\"
    • \"cuml\"

verbose: int, default=0 Verbosity level of the class. Choose from:

  • 0 to not print anything.
  • 1 to print basic information.

logger: str, Logger or None, default=None

  • If None: Logging isn't used.
  • If str: Name of the log file. Use \"auto\" for automatic naming.
  • Else: Python logging.Logger instance.

random_state: int or None, default=None Seed used by the quantile strategy. If None, the random number generator is the RandomState used by np.random.

**kwargs Additional keyword arguments for the strategy estimator.

Attributes[strategy]_: sklearn transformer Object with which the data is transformed, e.g., normalizer.yeojohnson_ for the default strategy.

feature_names_in_: np.ndarray Names of features seen during fit.

n_features_in_: int Number of features seen during fit.

See Also

Cleaner Applies standard data cleaning steps on a dataset.

Pruner Prune outliers from the data.

Scaler Scale the data.

"}, {"location": "API/data_cleaning/normalizer/#example", "title": "Example", "text": "atomstand-alone
>>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630              0.05439         0.1720  ...           107.30       740.4            0.1610            0.42250           0.5030               0.22580          0.2807                  0.10710       0\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690              0.09451         0.1860  ...           142.20      1493.0            0.1492            0.25360           0.3759               0.15100          0.3074                  0.07863       0\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699              0.04744         0.1538  ...           135.10      1320.0            0.1315            0.18060           0.2080               0.11360          0.2504                  0.07948       0\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686              0.02739         0.1852  ...           110.10       931.4            0.1148            0.09866           0.1547               0.06575          0.3233                  0.06165       0\n4           8.95         15.76           58.74      245.2          0.09462           0.12430         0.09263              0.02308         0.1305  ...            
63.34       270.0            0.1179            0.18790           0.1544               0.03846          0.1652                  0.07722       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     ...\n564        14.34         13.47           92.51      641.2          0.09906           0.07624         0.05724              0.04603         0.2075  ...           110.40       873.2            0.1297            0.15250           0.1632               0.10870          0.3062                  0.06072       1\n565        13.17         21.81           85.42      531.5          0.09714           0.10470         0.08259              0.05252         0.1746  ...           105.50       740.7            0.1503            0.39040           0.3728               0.16070          0.3693                  0.09618       0\n566        17.30         17.08          113.00      928.2          0.10080           0.10410         0.12660              0.08353         0.1813  ...           130.90      1222.0            0.1416            0.24050           0.3378               0.18570          0.3138                  0.08113       0\n567        17.68         20.74          117.40      963.7          0.11150           0.16650         0.18550              0.10540         0.1971  ...           132.90      1302.0            0.1418            0.34980           0.3583               0.15150          0.2463                  0.07738       0\n568        14.80         17.66           95.88      674.8          0.09179           0.08890         0.04069              0.02260         0.1893  ...           
105.90       829.5            0.1226            0.18810           0.2060               0.08308          0.3600                  0.07285       1\n\n[569 rows x 31 columns]\n\n\n>>> atom.plot_distribution(columns=0)\n
>>> atom.normalize(verbose=2)\n\nFitting Normalizer...\nNormalizing features...\n\n\n>>> print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0      -0.017068      0.464087        0.031104  -0.020222         0.390628          0.620790        0.562136             0.426774      -0.280554  ...         0.251532    0.081524          1.224389           1.206519         1.189835              1.522769       -0.043007                 1.378960       0\n1       1.182066      0.411242        1.183030   1.200556         0.741209          0.608244        1.100342             1.256472       0.256014  ...         1.119375    1.218096          0.759546           0.244492         0.726989              0.650523        0.424017                -0.164104       0\n2       1.105309      1.197684        1.018344   1.106437        -0.552214         -0.652544       -0.230044             0.226950      -1.050816  ...         0.973194    1.037232          0.002307          -0.374986        -0.128679              0.107299       -0.647198                -0.100126       0\n3       0.455144      2.077941        0.379512   0.486019        -0.966587         -1.447057       -0.438308            -0.480189       0.226570  ...         0.337722    0.483003         -0.785100          -1.301043        -0.483292             -0.722786        0.676588                -1.783846       0\n4      -1.898537     -0.815757       -1.745528  -1.873415        -0.102067          0.599235        0.374346            -0.662103      -2.173761  ...        -1.869111   -2.095123         -0.633206          -0.305478        -0.485431             -1.278472       -2.898859                -0.273347       1\n..           ...           ...             ...        ... 
             ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     ...\n564     0.238929     -1.546154        0.209113   0.257899         0.214334         -0.482480       -0.225132             0.183841       0.996371  ...         0.346743    0.373205         -0.079012          -0.660736        -0.423384              0.029761        0.404215                -1.894769       1\n565    -0.115233      0.675396       -0.105672  -0.125511         0.078814          0.213069        0.222118             0.375009      -0.177404  ...         0.194134    0.082260          0.804177           1.061384         0.714032              0.778530        1.315113                 0.913117       0\n566     0.972621     -0.443853        0.950416   0.971288         0.335466          0.200161        0.804757             1.074782       0.080964  ...         0.880583    0.920102          0.443592           0.144776         0.561298              1.086695        0.527842                 0.020173       0\n567     1.053489      0.446545        1.084407   1.040647         1.046541          1.237987        1.321388             1.410770       0.650180  ...         0.925288    1.016604          0.452080           0.855688         0.652219              0.657243       -0.735710                -0.260751       0\n568     0.366875     -0.289945        0.346701   0.359700        -0.309357         -0.150999       -0.574459            -0.683107       0.375972  ...         0.207028    0.284140         -0.407994          -0.303600        -0.141124             -0.402554        1.196110                -0.638106       1\n\n[569 rows x 31 columns]\n\n\n>>> atom.plot_distribution(columns=0)\n
>>> from atom.data_cleaning import Normalizer\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> normalizer = Normalizer(verbose=2)\n>>> X = normalizer.fit_transform(X)\n\nFitting Normalizer...\nNormalizing features...\n\n\n>>> print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst texture  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension\n0       1.134881     -2.678666        1.259822   1.126421         1.504114          2.165938        1.862988             1.848558       1.953067  ...      -1.488367         1.810506    1.652210          1.282792           1.942737         1.730182              1.935654        2.197206                 1.723624\n1       1.619346     -0.264377        1.528723   1.633946        -0.820227         -0.384102        0.291976             0.820609       0.102291  ...      -0.288382         1.430616    1.610022         -0.325080          -0.296580         0.070746              1.101594       -0.121997                 0.537179\n2       1.464796      0.547806        1.454664   1.461645         0.963977          1.163977        1.403673             1.683104       0.985668  ...       0.071406         1.321941    1.425307          0.580301           1.209701         1.005512              1.722744        1.218181                 0.453955\n3      -0.759262      0.357721       -0.514886  -0.836238         2.781494          2.197843        1.642391             1.423004       2.360528  ...       
0.228089        -0.039480   -0.436860          2.857821           2.282276         1.675087              1.862378        3.250202                 2.517606\n4       1.571260     -1.233520        1.583340   1.595120         0.343932          0.762392        1.407479             1.410929       0.090964  ...      -1.637882         1.316582    1.309486          0.284367          -0.131829         0.817474              0.807077       -0.943554                -0.279402\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...            ...              ...         ...               ...                ...              ...                   ...             ...                      ...\n564     1.781795      0.785604        1.746492   1.823030         1.052829          0.460810        1.653784             1.783067      -0.232645  ...       0.212151         1.547961    1.657442          0.438013          -0.077871         0.859079              1.503734       -1.721528                -0.751459\n565     1.543335      1.845150        1.485601   1.545430         0.168014          0.207602        0.984746             1.320730      -0.129120  ...       1.832201         1.365939    1.443167         -0.667317          -0.245277         0.480804              0.810995       -0.480093                -1.210527\n566     0.828589      1.817618        0.811329   0.835270        -0.835509          0.183969        0.375105             0.396882      -0.808189  ...       1.320625         0.786129    0.796192         -0.799337           0.626487         0.566826              0.526136       -1.301164                -0.170872\n567     1.624440      2.016299        1.702747   1.551036         1.468642          2.162820        1.994466             1.884414       1.899087  ...       
1.968949         1.810506    1.513198          1.387135           2.284642         2.136932              1.931990        1.744693                 1.850944\n568    -2.699432      1.203224       -2.827766  -2.703256        -3.834325         -1.481409       -1.658319            -1.845392      -0.821560  ...       0.810681        -2.231436   -2.149403         -2.064647          -1.731936        -1.819966             -2.131070        0.103122                -0.820663\n\n[569 rows x 30 columns]\n
"}, {"location": "API/data_cleaning/normalizer/#methods", "title": "Methods", "text": "

fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformApply the inverse transformation to the data.set_paramsSet the parameters of this estimator.transformApply the transformations to the data.

method fit(X, y=None)[source]Fit to data.

ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

ReturnsSelf Estimator instance.

method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

**fit_params Additional keyword arguments for the fit method.

Returnsdataframe Transformed feature set. Only returned if provided.

series or dataframe Transformed target column. Only returned if provided.

method get_params(deep=True)[source]Get parameters for this estimator.

Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

Returnsparams : dict Parameter names mapped to their values.

method inverse_transform(X, y=None)[source]Apply the inverse transformation to the data.

ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

Returnsdataframe Original dataframe.

method set_params(**params)[source]Set the parameters of this estimator.

Parameters**params : dict Estimator parameters.

Returnsself : estimator instance Estimator instance.

method transform(X, y=None)[source]Apply the transformations to the data.

ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

Returnsdataframe Normalized dataframe.

"}, {"location": "API/data_cleaning/pruner/", "title": "Pruner", "text": "

class atom.data_cleaning.Pruner(strategy=\"zscore\", method=\"drop\", max_sigma=3, include_target=False, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, **kwargs)[source]Prune outliers from the data.

Replace or remove outliers. The definition of an outlier depends on the selected strategy and can differ greatly between strategies. Ignores categorical columns.

This class can be accessed from atom through the prune method. Read more in the user guide.

Info

The \"sklearnex\" and \"cuml\" engines are only supported for strategy=\"dbscan\".

Parametersstrategy: str or sequence, default=\"zscore\" Strategy with which to select the outliers. If sequence of strategies, only samples marked as outliers by all chosen strategies are dropped. Choose from:

  • \"zscore\": Z-score of each data value.
  • \"iforest\": Isolation Forest.
  • \"ee\": Elliptic Envelope.
  • \"lof\": Local Outlier Factor.
  • \"svm\": One-class SVM.
  • \"dbscan\": Density-Based Spatial Clustering.
  • \"hdbscan\": Hierarchical Density-Based Spatial Clustering.
  • \"optics\": DBSCAN-like clustering approach.

method: int, float or str, default=\"drop\" Method to apply on the outliers. Only the zscore strategy accepts a method other than \"drop\". Choose from:

  • \"drop\": Drop any sample with outlier values.
  • \"minmax\": Replace outlier with the min/max of the column.
  • Any numerical value with which to replace the outliers.

max_sigma: int or float, default=3 Maximum allowed standard deviations from the mean of the column. If more, it is considered an outlier. Only if strategy=\"zscore\".

include_target: bool, default=False Whether to include the target column in the search for outliers. This can be useful for regression tasks. Only if strategy=\"zscore\".

device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

  • \"data\":

    • \"numpy\"
    • \"pyarrow\"
    • \"modin\"
  • \"estimator\":

    • \"sklearn\"
    • \"sklearnex\"
    • \"cuml\"

verbose: int, default=0 Verbosity level of the class. Choose from:

  • 0 to not print anything.
  • 1 to print basic information.
  • 2 to print detailed information.

logger: str, Logger or None, default=None

  • If None: Logging isn't used.
  • If str: Name of the log file. Use \"auto\" for automatic naming.
  • Else: Python logging.Logger instance.

**kwargs Additional keyword arguments for the strategy estimator. If sequence of strategies, the params should be provided in a dict with the strategy's name as key.

Attributes[strategy]_: sklearn estimator Object used to prune the data, e.g., pruner.iforest_ for the isolation forest strategy. Not available for strategy=\"zscore\".

feature_names_in_: np.ndarray Names of features seen during fit.

n_features_in_: int Number of features seen during fit.

See Also

Balancer Balance the number of samples per class in the target column.

Normalizer Transform the data to follow a Normal/Gaussian distribution.

Scaler Scale the data.

"}, {"location": "API/data_cleaning/pruner/#example", "title": "Example", "text": "atomstand-alone
>>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630              0.05439         0.1720  ...           107.30       740.4            0.1610            0.42250           0.5030               0.22580          0.2807                  0.10710       0\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690              0.09451         0.1860  ...           142.20      1493.0            0.1492            0.25360           0.3759               0.15100          0.3074                  0.07863       0\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699              0.04744         0.1538  ...           135.10      1320.0            0.1315            0.18060           0.2080               0.11360          0.2504                  0.07948       0\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686              0.02739         0.1852  ...           110.10       931.4            0.1148            0.09866           0.1547               0.06575          0.3233                  0.06165       0\n4           8.95         15.76           58.74      245.2          0.09462           0.12430         0.09263              0.02308         0.1305  ...            
63.34       270.0            0.1179            0.18790           0.1544               0.03846          0.1652                  0.07722       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     ...\n564        14.34         13.47           92.51      641.2          0.09906           0.07624         0.05724              0.04603         0.2075  ...           110.40       873.2            0.1297            0.15250           0.1632               0.10870          0.3062                  0.06072       1\n565        13.17         21.81           85.42      531.5          0.09714           0.10470         0.08259              0.05252         0.1746  ...           105.50       740.7            0.1503            0.39040           0.3728               0.16070          0.3693                  0.09618       0\n566        17.30         17.08          113.00      928.2          0.10080           0.10410         0.12660              0.08353         0.1813  ...           130.90      1222.0            0.1416            0.24050           0.3378               0.18570          0.3138                  0.08113       0\n567        17.68         20.74          117.40      963.7          0.11150           0.16650         0.18550              0.10540         0.1971  ...           132.90      1302.0            0.1418            0.34980           0.3583               0.15150          0.2463                  0.07738       0\n568        14.80         17.66           95.88      674.8          0.09179           0.08890         0.04069              0.02260         0.1893  ...           
105.90       829.5            0.1226            0.18810           0.2060               0.08308          0.3600                  0.07285       1\n\n[569 rows x 31 columns]\n\n\n>>> atom.prune(strategy=\"iforest\", verbose=2)\n\nFitting Pruner...\nPruning outliers...\n --> Dropping 63 outliers.\n\n\n>>> # Note the reduced number of rows\n>>> print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630              0.05439         0.1720  ...           107.30       740.4            0.1610            0.42250           0.5030               0.22580          0.2807                  0.10710       0\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690              0.09451         0.1860  ...           142.20      1493.0            0.1492            0.25360           0.3759               0.15100          0.3074                  0.07863       0\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699              0.04744         0.1538  ...           135.10      1320.0            0.1315            0.18060           0.2080               0.11360          0.2504                  0.07948       0\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686              0.02739         0.1852  ...           
110.10       931.4            0.1148            0.09866           0.1547               0.06575          0.3233                  0.06165       0\n4          10.26         16.58           65.85      320.8          0.08877           0.08066         0.04358              0.02438         0.1669  ...            71.08       357.4            0.1461            0.22460           0.1783               0.08333          0.2691                  0.09479       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     ...\n501        14.34         13.47           92.51      641.2          0.09906           0.07624         0.05724              0.04603         0.2075  ...           110.40       873.2            0.1297            0.15250           0.1632               0.10870          0.3062                  0.06072       1\n502        13.17         21.81           85.42      531.5          0.09714           0.10470         0.08259              0.05252         0.1746  ...           105.50       740.7            0.1503            0.39040           0.3728               0.16070          0.3693                  0.09618       0\n503        17.30         17.08          113.00      928.2          0.10080           0.10410         0.12660              0.08353         0.1813  ...           130.90      1222.0            0.1416            0.24050           0.3378               0.18570          0.3138                  0.08113       0\n504        17.68         20.74          117.40      963.7          0.11150           0.16650         0.18550              0.10540         0.1971  ...           
132.90      1302.0            0.1418            0.34980           0.3583               0.15150          0.2463                  0.07738       0\n505        14.80         17.66           95.88      674.8          0.09179           0.08890         0.04069              0.02260         0.1893  ...           105.90       829.5            0.1226            0.18810           0.2060               0.08308          0.3600                  0.07285       1\n\n[506 rows x 31 columns]\n\n\n>>> atom.plot_distribution(columns=0)\n
>>> from atom.data_cleaning import Normalizer\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> normalizer = Normalizer(verbose=2)\n>>> X = normalizer.fit_transform(X)\n\nFitting Normalizer...\nNormalizing features...\n\n\n>>> # Note the reduced number of rows\n>>> print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst texture  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension\n0       1.134881     -2.678666        1.259822   1.126421         1.504114          2.165938        1.862988             1.848558       1.953067  ...      -1.488367         1.810506    1.652210          1.282792           1.942737         1.730182              1.935654        2.197206                 1.723624\n1       1.619346     -0.264377        1.528723   1.633946        -0.820227         -0.384102        0.291976             0.820609       0.102291  ...      -0.288382         1.430616    1.610022         -0.325080          -0.296580         0.070746              1.101594       -0.121997                 0.537179\n2       1.464796      0.547806        1.454664   1.461645         0.963977          1.163977        1.403673             1.683104       0.985668  ...       0.071406         1.321941    1.425307          0.580301           1.209701         1.005512              1.722744        1.218181                 0.453955\n3      -0.759262      0.357721       -0.514886  -0.836238         2.781494          2.197843        1.642391             1.423004       2.360528  ...       
0.228089        -0.039480   -0.436860          2.857821           2.282276         1.675087              1.862378        3.250202                 2.517606\n4       1.571260     -1.233520        1.583340   1.595120         0.343932          0.762392        1.407479             1.410929       0.090964  ...      -1.637882         1.316582    1.309486          0.284367          -0.131829         0.817474              0.807077       -0.943554                -0.279402\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...            ...              ...         ...               ...                ...              ...                   ...             ...                      ...\n564     1.781795      0.785604        1.746492   1.823030         1.052829          0.460810        1.653784             1.783067      -0.232645  ...       0.212151         1.547961    1.657442          0.438013          -0.077871         0.859079              1.503734       -1.721528                -0.751459\n565     1.543335      1.845150        1.485601   1.545430         0.168014          0.207602        0.984746             1.320730      -0.129120  ...       1.832201         1.365939    1.443167         -0.667317          -0.245277         0.480804              0.810995       -0.480093                -1.210527\n566     0.828589      1.817618        0.811329   0.835270        -0.835509          0.183969        0.375105             0.396882      -0.808189  ...       1.320625         0.786129    0.796192         -0.799337           0.626487         0.566826              0.526136       -1.301164                -0.170872\n567     1.624440      2.016299        1.702747   1.551036         1.468642          2.162820        1.994466             1.884414       1.899087  ...       
1.968949         1.810506    1.513198          1.387135           2.284642         2.136932              1.931990        1.744693                 1.850944\n568    -2.699432      1.203224       -2.827766  -2.703256        -3.834325         -1.481409       -1.658319            -1.845392      -0.821560  ...       0.810681        -2.231436   -2.149403         -2.064647          -1.731936        -1.819966             -2.131070        0.103122                -0.820663\n\n[569 rows x 30 columns]\n
"}, {"location": "API/data_cleaning/pruner/#methods", "title": "Methods", "text": "

fitDo nothing.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformApply the outlier strategy on the data.

method fit(X=None, y=None, **fit_params)[source]Do nothing.

Implemented for continuity of the API.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

**fit_params Additional keyword arguments for the fit method.

Returnsself Estimator instance.

method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

**fit_params Additional keyword arguments for the fit method.

Returnsdataframe Transformed feature set. Only returned if provided.

series or dataframe Transformed target column. Only returned if provided.

method get_params(deep=True)[source]Get parameters for this estimator.

Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

Returnsparams : dict Parameter names mapped to their values.

method inverse_transform(X=None, y=None)[source]Do nothing.

Returns the input unchanged. Implemented for continuity of the API.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

Returnsdataframe Feature set. Only returned if provided.

series or dataframe Target column. Only returned if provided.

method set_params(**params)[source]Set the parameters of this estimator.

Parameters**params : dict Estimator parameters.

Returnsself : estimator instance Estimator instance.

method transform(X, y=None)[source]Apply the outlier strategy on the data.

ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe: Target columns for multioutput tasks.

Returnsdataframe Transformed feature set.

series Transformed target column. Only returned if provided.

"}, {"location": "API/data_cleaning/scaler/", "title": "Scaler", "text": "

class atom.data_cleaning.Scaler(strategy=\"standard\", include_binary=False, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, **kwargs)[source]Scale the data.

Apply one of sklearn's scalers. Categorical columns are ignored.

This class can be accessed from atom through the scale method. Read more in the user guide.

Parametersstrategy: str, default=\"standard\" Strategy with which to scale the data. Choose from:

  • \"standard\": Remove mean and scale to unit variance.
  • \"minmax\": Scale features to a given range.
  • \"maxabs\": Scale features by their maximum absolute value.
  • \"robust\": Scale using statistics that are robust to outliers.

include_binary: bool, default=False Whether to scale binary columns (only 0s and 1s).

device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

  • \"data\":

    • \"numpy\"
    • \"pyarrow\"
    • \"modin\"
  • \"estimator\":

    • \"sklearn\"
    • \"cuml\"

verbose: int, default=0 Verbosity level of the class. Choose from:

  • 0 to not print anything.
  • 1 to print basic information.

logger: str, Logger or None, default=None

  • If None: Logging isn't used.
  • If str: Name of the log file. Use \"auto\" for automatic naming.
  • Else: Python logging.Logger instance.

**kwargs Additional keyword arguments for the strategy estimator.

Attributes[strategy]_: sklearn transformer Object with which the data is scaled, e.g., scaler.standard_ for the default strategy.

feature_names_in_: np.ndarray Names of features seen during fit.

n_features_in_: int Number of features seen during fit.

See Also

Balancer Balance the number of samples per class in the target column.

Normalizer Transform the data to follow a Normal/Gaussian distribution.

Scaler Scale the data.

"}, {"location": "API/data_cleaning/scaler/#example", "title": "Example", "text": "atomstand-alone
>>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630              0.05439         0.1720  ...           107.30       740.4            0.1610            0.42250           0.5030               0.22580          0.2807                  0.10710       0\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690              0.09451         0.1860  ...           142.20      1493.0            0.1492            0.25360           0.3759               0.15100          0.3074                  0.07863       0\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699              0.04744         0.1538  ...           135.10      1320.0            0.1315            0.18060           0.2080               0.11360          0.2504                  0.07948       0\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686              0.02739         0.1852  ...           110.10       931.4            0.1148            0.09866           0.1547               0.06575          0.3233                  0.06165       0\n4           8.95         15.76           58.74      245.2          0.09462           0.12430         0.09263              0.02308         0.1305  ...            
63.34       270.0            0.1179            0.18790           0.1544               0.03846          0.1652                  0.07722       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     ...\n564        14.34         13.47           92.51      641.2          0.09906           0.07624         0.05724              0.04603         0.2075  ...           110.40       873.2            0.1297            0.15250           0.1632               0.10870          0.3062                  0.06072       1\n565        13.17         21.81           85.42      531.5          0.09714           0.10470         0.08259              0.05252         0.1746  ...           105.50       740.7            0.1503            0.39040           0.3728               0.16070          0.3693                  0.09618       0\n566        17.30         17.08          113.00      928.2          0.10080           0.10410         0.12660              0.08353         0.1813  ...           130.90      1222.0            0.1416            0.24050           0.3378               0.18570          0.3138                  0.08113       0\n567        17.68         20.74          117.40      963.7          0.11150           0.16650         0.18550              0.10540         0.1971  ...           132.90      1302.0            0.1418            0.34980           0.3583               0.15150          0.2463                  0.07738       0\n568        14.80         17.66           95.88      674.8          0.09179           0.08890         0.04069              0.02260         0.1893  ...           
105.90       829.5            0.1226            0.18810           0.2060               0.08308          0.3600                  0.07285       1\n\n[569 rows x 31 columns]\n\n\n>>> atom.scale(verbose=2)\n\nFitting Scaler...\nScaling features...\n\n\n>>> # Note the reduced number of rows\n>>> print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0      -0.181875      0.356669       -0.147122  -0.270991         0.340268          0.381628        0.214571             0.125567      -0.345050  ...         0.000933   -0.246244          1.240292           1.077359         1.116229              1.667157       -0.162964                 1.326816       0\n1       1.162216      0.300578        1.159704   1.097856         0.707625          0.368288        0.852572             1.148598       0.172744  ...         1.025723    1.042996          0.719898          -0.011475         0.500961              0.537309        0.280594                -0.308640       0\n2       1.056470      1.212060        0.933833   0.950360        -0.581659         -0.670877       -0.407166            -0.051653      -1.018183  ...         0.817241    0.746639         -0.060694          -0.482078        -0.311813             -0.027615       -0.666328                -0.259812       0\n3       0.277287      2.457753        0.188054   0.174273        -0.959614         -1.132432       -0.534892            -0.562913       0.143156  ...         0.083151    0.080948         -0.797185          -1.010314        -0.569828             -0.750385        0.544735                -1.284055       0\n4      -1.442482     -0.825921       -1.343434  -1.143186        -0.152840          0.358760        0.042209            -0.672815      -1.879941  ...        
-1.289891   -1.052061         -0.660471          -0.435018        -0.571280             -1.162598       -2.081728                -0.389638       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     ...\n564     0.057446     -1.361124        0.018651  -0.043220         0.160827         -0.557108       -0.404013            -0.087607       0.967929  ...         0.091960   -0.018751         -0.140077          -0.663228        -0.528681             -0.101629        0.260659                -1.337478       1\n565    -0.268141      0.588045       -0.267318  -0.347933         0.025188         -0.014753       -0.084382             0.077883      -0.248889  ...        -0.051921   -0.245730          0.768409           0.870422         0.485954              0.683827        1.308918                 0.699518       0\n566     0.881154     -0.517419        0.845098   0.753978         0.283751         -0.026187        0.470528             0.868616      -0.001087  ...         0.693914    0.578760          0.384728          -0.095926         0.316526              1.061450        0.386915                -0.165028       0\n567     0.986900      0.337972        1.022568   0.852586         1.039660          1.162956        1.213182             1.426285       0.583281  ...         0.752641    0.715804          0.393548           0.608690         0.415763              0.544861       -0.734440                -0.380446       0\n568     0.185455     -0.381865        0.154577   0.050111        -0.352767         -0.315850       -0.612688            -0.685055       0.294796  ...        
-0.040176   -0.093611         -0.453195          -0.433728        -0.321494             -0.488617        1.154420                -0.640672       1\n\n[569 rows x 31 columns]\n
>>> from atom.data_cleaning import Scaler\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> scaler = Scaler(verbose=2)\n>>> X = scaler.fit_transform(X)\n\nFitting Scaler...\nScaling features...\n\n\n>>> # Note the reduced number of rows\n>>> print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst texture  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension\n0       1.097064     -2.073335        1.269934   0.984375         1.568466          3.283515        2.652874             2.532475       2.217515  ...      -1.359293         2.303601    2.001237          1.307686           2.616665         2.109526              2.296076        2.750622                 1.937015\n1       1.829821     -0.353632        1.685955   1.908708        -0.826962         -0.487072       -0.023846             0.548144       0.001392  ...      -0.369203         1.535126    1.890489         -0.375612          -0.430444        -0.146749              1.087084       -0.243890                 0.281190\n2       1.579888      0.456187        1.566503   1.558884         0.942210          1.052926        1.363478             2.037231       0.939685  ...      -0.023974         1.347475    1.456285          0.527407           1.082932         0.854974              1.955000        1.152255                 0.201391\n3      -0.768909      0.253732       -0.592687  -0.764464         3.283553          3.402909        1.915897             1.451707       2.867383  ...       
0.133984        -0.249939   -0.550021          3.394275           3.893397         1.989588              2.175786        6.046041                 4.935010\n4       1.750297     -1.151816        1.776573   1.826229         0.280372          0.539340        1.371011             1.428493      -0.009560  ...      -1.466770         1.338539    1.220724          0.220556          -0.313395         0.613179              0.729259       -0.868353                -0.397100\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...            ...              ...         ...               ...                ...              ...                   ...             ...                      ...\n564     2.110995      0.721473        2.060786   2.343856         1.041842          0.219060        1.947285             2.320965      -0.312589  ...       0.117700         1.752563    2.015301          0.378365          -0.273318         0.664512              1.629151       -1.360158                -0.709091\n565     1.704854      2.085134        1.615931   1.723842         0.102458         -0.017833        0.693043             1.263669      -0.217664  ...       2.047399         1.421940    1.494959         -0.691230          -0.394820         0.236573              0.733827       -0.531855                -0.973978\n566     0.702284      2.045574        0.672676   0.577953        -0.840484         -0.038680        0.046588             0.105777      -0.809117  ...       1.374854         0.579001    0.427906         -0.809587           0.350735         0.326767              0.414069       -1.104549                -0.318409\n567     1.838341      2.336457        1.982524   1.735218         1.525767          3.272144        3.296944             2.658866       2.137194  ...       
2.237926         2.303601    1.653171          1.430427           3.904848         3.197605              2.289985        1.919083                 2.219635\n568    -1.808401      1.221792       -1.814389  -1.347789        -3.112085         -1.150752       -1.114873            -1.261820      -0.820070  ...       0.764190        -1.432735   -1.075813         -1.859019          -1.207552        -1.305831             -1.745063       -0.048138                -0.751207\n\n[569 rows x 30 columns]\n
"}, {"location": "API/data_cleaning/scaler/#methods", "title": "Methods", "text": "

fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformApply the inverse transformation to the data.set_paramsSet the parameters of this estimator.transformPerform standardization by centering and scaling.

method fit(X, y=None)[source]Fit to data.

ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

ReturnsSelf Estimator instance.

method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

**fit_params Additional keyword arguments for the fit method.

Returnsdataframe Transformed feature set. Only returned if provided.

series or dataframe Transformed target column. Only returned if provided.

method get_params(deep=True)[source]Get parameters for this estimator.

Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

Returnsparams : dict Parameter names mapped to their values.

method inverse_transform(X, y=None)[source]Apply the inverse transformation to the data.

ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

Returnsdataframe Scaled dataframe.

method set_params(**params)[source]Set the parameters of this estimator.

Parameters**params : dict Estimator parameters.

Returnsself : estimator instance Estimator instance.

method transform(X, y=None)[source]Perform standardization by centering and scaling.

ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

Returnsdataframe Scaled dataframe.

"}, {"location": "API/feature_engineering/featureextractor/", "title": "FeatureExtractor", "text": "

class atom.feature_engineering.FeatureExtractor(features=('day', 'month', 'year'), fmt=None, encoding_type=\"ordinal\", drop_columns=True, verbose=0, logger=None)[source]Extract features from datetime columns.

Create new features extracting datetime elements (day, month, year, etc.) from the provided columns. Columns of dtype datetime64 are used as is. Categorical columns that can be successfully converted to a datetime format (less than 30% NaT values after conversion) are also used.

This class can be accessed from atom through the feature_extraction method. Read more in the user guide.

Warning

Decision tree-based algorithms build their split rules according to one feature at a time. This means that they will fail to correctly process cyclic features, since the sin/cos features should be considered one single coordinate system.

Parametersfeatures: str or sequence, default=(\"day\", \"month\", \"year\") Features to create from the datetime columns. Note that created features with zero variance (e.g., the feature hour in a column that only contains dates) are ignored. Allowed values are datetime attributes from pandas.Series.dt.

fmt: str, sequence or None, default=None Format (strptime) of the categorical columns that need to be converted to datetime. If sequence, the n-th format corresponds to the n-th categorical column that can be successfully converted. If None, the format is inferred automatically from the first non-NaN value. Values that cannot be converted are returned as NaT.

encoding_type: str, default=\"ordinal\" Type of encoding to use. Choose from:

  • \"ordinal\": Encode features in increasing order.
  • \"cyclic\": Encode features using sine and cosine to capture their cyclic nature. This approach creates two columns for every feature. Non-cyclic features still use ordinal encoding.

drop_columns: bool, default=True Whether to drop the original columns after transformation.

verbose: int, default=0 Verbosity level of the class. Choose from:

  • 0 to not print anything.
  • 1 to print basic information.
  • 2 to print detailed information.

logger: str, Logger or None, default=None

  • If None: Logging isn't used.
  • If str: Name of the log file. Use \"auto\" for automatic naming.
  • Else: Python logging.Logger instance.

Attributesfeature_names_in_: np.ndarray Names of features seen during fit.

n_features_in_: int Number of features seen during fit.

See Also

FeatureGenerator Generate new features.

FeatureGrouper Extract statistics from similar features.

FeatureSelector Reduce the number of features in the data.

"}, {"location": "API/feature_engineering/featureextractor/#example", "title": "Example", "text": "atomstand-alone
>>> import pandas as pd\n>>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> # Add a datetime column\n>>> X[\"date\"] = pd.date_range(start=\"1/1/2018\", periods=len(X))\n\n>>> atom = ATOMClassifier(X, y)\n>>> atom.feature_extraction(features=[\"day\"], fmt=\"%d/%m/%Y\", verbose=2)\n\nFitting FeatureExtractor...\nExtracting datetime features...\n --> Extracting features from column date.\n   --> Creating feature date_day.\n\n\n>>> # Note the date_day column\n>>> print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  date_day  target\n0         12.770         22.47           81.72      506.3          0.09055           0.05761         0.04711              0.02704         0.1585  ...       653.6            0.1419             0.1523           0.2177               0.09331          0.2829                  0.08067        16       0\n1         27.420         26.27          186.90     2501.0          0.10840           0.19880         0.36350              0.16890         0.2061  ...      4254.0            0.1357             0.4256           0.6833               0.26250          0.2641                  0.07427         7       0\n2         15.850         23.95          103.70      782.7          0.08401           0.10020         0.09938              0.05364         0.1847  ...       876.5            0.1131             0.1924           0.2322               0.11190          0.2809                  0.06287        14       0\n3         14.190         23.81           92.87      610.7          0.09463           0.13060         0.11150              0.06462         0.2235  ...       
811.3            0.1559             0.4059           0.3744               0.17720          0.4724                  0.10260         3       0\n4          8.950         15.76           58.74      245.2          0.09462           0.12430         0.09263              0.02308         0.1305  ...       270.0            0.1179             0.1879           0.1544               0.03846          0.1652                  0.07722        27       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...         ...               ...                ...              ...                   ...             ...                      ...       ...     ...\n564       10.800         21.98           68.79      359.9          0.08801           0.05743         0.03614              0.01404         0.2016  ...       489.5            0.1303             0.1696           0.1927               0.07485          0.2965                  0.07662         4       1\n565       11.930         10.91           76.14      442.7          0.08872           0.05242         0.02606              0.01796         0.1601  ...       589.5            0.1374             0.1575           0.1514               0.06876          0.2460                  0.07262         6       1\n566       24.630         21.60          165.50     1841.0          0.10300           0.21060         0.23100              0.14710         0.1991  ...      2642.0            0.1342             0.4188           0.4658               0.24750          0.3157                  0.09671         6       0\n567        6.981         13.43           43.79      143.5          0.11700           0.07568         0.00000              0.00000         0.1930  ...       
185.2            0.1584             0.1202           0.0000               0.00000          0.2932                  0.09382        12       1\n568       15.050         19.07           97.26      701.9          0.09215           0.08597         0.07486              0.04335         0.1561  ...       967.0            0.1246             0.2101           0.2866               0.11200          0.2282                  0.06954        30       0\n\n[569 rows x 32 columns]\n
>>> import pandas as pd\n>>> from atom.feature_engineering import FeatureExtractor\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, _ = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> # Add a datetime column\n>>> X[\"date\"] = pd.date_range(start=\"1/1/2018\", periods=len(X))\n\n>>> fe = FeatureExtractor(features=[\"day\"], fmt=\"%Y-%m-%d\", verbose=2)\n>>> X = fe.transform(X)\n\nExtracting datetime features...\n --> Extracting features from column date.\n   --> Creating feature date_day.\n\n\n>>> # Note the date_day column\n>>> print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  date_day\n0          17.99         10.38          122.80     1001.0          0.11840           0.27760         0.30010              0.14710         0.2419  ...           184.60      2019.0           0.16220            0.66560           0.7119                0.2654          0.4601                  0.11890         1\n1          20.57         17.77          132.90     1326.0          0.08474           0.07864         0.08690              0.07017         0.1812  ...           158.80      1956.0           0.12380            0.18660           0.2416                0.1860          0.2750                  0.08902         2\n2          19.69         21.25          130.00     1203.0          0.10960           0.15990         0.19740              0.12790         0.2069  ...           152.50      1709.0           0.14440            0.42450           0.4504                0.2430          0.3613                  0.08758         3\n3          11.42         20.38           77.58      386.1          0.14250           0.28390         0.24140              0.10520         0.2597  ...            
98.87       567.7           0.20980            0.86630           0.6869                0.2575          0.6638                  0.17300         4\n4          20.29         14.34          135.10     1297.0          0.10030           0.13280         0.19800              0.10430         0.1809  ...           152.20      1575.0           0.13740            0.20500           0.4000                0.1625          0.2364                  0.07678         5\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...       ...\n564        21.56         22.39          142.00     1479.0          0.11100           0.11590         0.24390              0.13890         0.1726  ...           166.10      2027.0           0.14100            0.21130           0.4107                0.2216          0.2060                  0.07115        19\n565        20.13         28.25          131.20     1261.0          0.09780           0.10340         0.14400              0.09791         0.1752  ...           155.00      1731.0           0.11660            0.19220           0.3215                0.1628          0.2572                  0.06637        20\n566        16.60         28.08          108.30      858.1          0.08455           0.10230         0.09251              0.05302         0.1590  ...           126.70      1124.0           0.11390            0.30940           0.3403                0.1418          0.2218                  0.07820        21\n567        20.60         29.33          140.10     1265.0          0.11780           0.27700         0.35140              0.15200         0.2397  ...           
184.60      1821.0           0.16500            0.86810           0.9387                0.2650          0.4087                  0.12400        22\n568         7.76         24.54           47.92      181.0          0.05263           0.04362         0.00000              0.00000         0.1587  ...            59.16       268.6           0.08996            0.06444           0.0000                0.0000          0.2871                  0.07039        23\n\n[569 rows x 31 columns]\n
"}, {"location": "API/feature_engineering/featureextractor/#methods", "title": "Methods", "text": "

fitDo nothing.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformExtract the new features.

method fit(X=None, y=None, **fit_params)[source]Do nothing.

Implemented for continuity of the API.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

**fit_params Additional keyword arguments for the fit method.

Returnsself Estimator instance.

method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

**fit_params Additional keyword arguments for the fit method.

Returnsdataframe Transformed feature set. Only returned if provided.

series or dataframe Transformed target column. Only returned if provided.

method get_params(deep=True)[source]Get parameters for this estimator.

Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

Returnsparams : dict Parameter names mapped to their values.

method inverse_transform(X=None, y=None)[source]Do nothing.

Returns the input unchanged. Implemented for continuity of the API.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

Returnsdataframe Feature set. Only returned if provided.

series or dataframe Target column. Only returned if provided.

method set_params(**params)[source]Set the parameters of this estimator.

Parameters**params : dict Estimator parameters.

Returnsself : estimator instance Estimator instance.

method transform(X, y=None)[source]Extract the new features.

ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

Returnsdataframe Transformed feature set.

"}, {"location": "API/feature_engineering/featuregenerator/", "title": "FeatureGenerator", "text": "

class atom.feature_engineering.FeatureGenerator(strategy=\"dfs\", n_features=None, operators=None, n_jobs=1, verbose=0, logger=None, random_state=None, **kwargs)[source]Generate new features.

Create new combinations of existing features to capture the non-linear relations between the original features.

This class can be accessed from atom through the feature_generation method. Read more in the user guide.

Warning

  • Using the div, log or sqrt operators can return new features with inf or NaN values. Check the warnings that may pop up or use atom's nans attribute.
  • When using dfs with n_jobs>1, make sure to protect your code with if __name__ == \"__main__\". Featuretools uses dask, which uses python multiprocessing for parallelization. The spawn method on multiprocessing starts a new python process, which requires it to import the __main__ module before it can do its task.
  • gfg can be slow for very large populations.

Tip

dfs can create many new features and not all of them will be useful. Use the FeatureSelector class to reduce the number of features.

Parametersstrategy: str, default=\"dfs\" Strategy to create new features. Choose from:

  • \"dfs\": Deep Feature Synthesis.
  • \"gfg\": Genetic Feature Generation.

n_features: int or None, default=None Maximum number of newly generated features to add to the dataset. If None, select all created features.

operators: str, sequence or None, default=None Mathematical operators to apply on the features. None to use all. Choose from: add, sub, mul, div, abs, sqrt, log, inv, sin, cos, tan.

n_jobs: int, default=1 Number of cores to use for parallel processing.

  • If >0: Number of cores to use.
  • If -1: Use all available cores.
  • If <-1: Use number of cores + 1 + n_jobs.

verbose: int, default=0 Verbosity level of the class. Choose from:

  • 0 to not print anything.
  • 1 to print basic information.
  • 2 to print detailed information.

logger: str, Logger or None, default=None

  • If None: Logging isn't used.
  • If str: Name of the log file. Use \"auto\" for automatic naming.
  • Else: Python logging.Logger instance.

random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

**kwargs Additional keyword arguments for the SymbolicTransformer instance. Only for the gfg strategy.

Attributesgfg_: SymbolicTransformer Object used to calculate the genetic features. Only available when strategy=\"gfg\".

genetic_features_: pd.DataFrame Information on the newly created non-linear features. Only available when strategy=\"gfg\". Columns include:

  • name: Name of the feature (generated automatically).
  • description: Operators used to create this feature.
  • fitness: Fitness score.

feature_names_in_: np.ndarray Names of features seen during fit.

n_features_in_: int Number of features seen during fit.

See Also

FeatureExtractor Extract features from datetime columns.

FeatureGrouper Extract statistics from similar features.

FeatureSelector Reduce the number of features in the data.

"}, {"location": "API/feature_engineering/featuregenerator/#example", "title": "Example", "text": "atomstand-alone
>>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y)\n>>> atom.feature_generation(strategy=\"dfs\", n_features=5, verbose=2)\n\nFitting FeatureGenerator...\nGenerating new features...\n --> 5 new features were added.\n\n\n>>> # Note the texture error / worst symmetry column\n>>> print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  ...  mean concave points * smoothness error  mean concavity + worst radius  mean radius / smoothness error  worst concave points * worst radius  worst radius / concave points error  target\n0         13.280         13.72           85.79      541.8          0.08363           0.08575  ...                                0.000122                       14.29077                     3109.342074                             1.306235                          1681.624941       1\n1         15.460         11.89          102.50      736.9          0.12570           0.15550  ...                                0.000592                       18.99320                     2866.679028                             3.432933                          1423.484848       0\n2         13.110         15.56           87.21      530.2          0.13980           0.17650  ...                                0.000688                       16.51710                     1830.494275                             3.239166                          1175.072046       0\n3          9.847         15.68           63.00      293.2          0.09492           0.08419  ...                                0.000211                       11.26330                     1127.691251                             0.733747                          1652.698133       1\n4         14.870         20.21           96.12      680.9          0.09587           0.08345  ...                                
0.000268                       16.07824                     2746.075716                             1.628217                          1353.338969       1\n..           ...           ...             ...        ...              ...               ...  ...                                     ...                            ...                             ...                                  ...                                  ...     ...\n564       14.470         24.99           95.81      656.4          0.08837           0.12300  ...                                0.000278                       16.32090                     2027.178481                             1.954510                          1395.869191       1\n565       19.690         21.25          130.00     1203.0          0.10960           0.15990  ...                                0.000787                       23.76740                     3201.626016                             5.727510                          1145.286686       0\n566       19.270         26.47          127.90     1162.0          0.09401           0.17190  ...                                0.000381                       24.31570                     3842.472582                             4.310775                          2504.407342       0\n567       11.760         18.14           75.00      431.1          0.09968           0.05914  ...                                0.000197                       13.38685                     2101.501072                             0.956576                           932.960894       0\n568       14.580         13.66           94.29      658.8          0.09832           0.08918  ...                                0.000215                       16.84222                     2943.670503                             1.539574                          1938.020352       1\n\n[569 rows x 36 columns]\n
>>> from atom.feature_engineering import FeatureGenerator\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> fg = FeatureGenerator(strategy=\"dfs\", n_features=5, verbose=2)\n>>> X = fg.fit_transform(X, y)\n\nFitting FeatureGenerator...\nGenerating new features...\n --> 5 new features were added.\n\n\n>>> # Note the radius error * worst smoothness column\n>>> print(X)\n\n       mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  ...  worst fractal dimension  mean area - perimeter error  mean texture * worst fractal dimension  symmetry error / concave points error  texture error * worst area  worst radius / compactness error\nindex                                                                                           ...                                                                                                                                                                                                   \n0            17.99         10.38          122.80     1001.0          0.11840           0.27760  ...                  0.11890                      992.411                                1.234182                               1.892250                   1827.8007                        517.536705\n1            20.57         17.77          132.90     1326.0          0.08474           0.07864  ...                  0.08902                     1322.602                                1.581885                               1.036567                   1435.5084                       1910.550459\n2            19.69         21.25          130.00     1203.0          0.10960           0.15990  ...                  
0.08758                     1198.415                                1.861075                               1.093294                   1344.8121                        588.367449\n3            11.42         20.38           77.58      386.1          0.14250           0.28390  ...                  0.17300                      382.655                                3.525740                               3.193894                    656.2612                        199.919549\n4            20.29         14.34          135.10     1297.0          0.10030           0.13280  ...                  0.07678                     1291.562                                1.101025                               0.931565                   1230.5475                        915.887850\n...            ...           ...             ...        ...              ...               ...  ...                      ...                          ...                                     ...                                    ...                         ...                               ...\n564          21.56         22.39          142.00     1479.0          0.11100           0.11590  ...                  0.07115                     1471.327                                1.593049                               0.453953                   2545.9120                        880.318229\n565          20.13         28.25          131.20     1261.0          0.09780           0.10340  ...                  0.06637                     1255.797                                1.874953                               1.131108                   4263.4530                        977.713578\n566          16.60         28.08          108.30      858.1          0.08455           0.10230  ...                  
0.07820                      854.675                                2.195856                               0.846500                   1208.3000                        508.710801\n567          20.60         29.33          140.10     1265.0          0.11780           0.27700  ...                  0.12400                     1259.228                                3.636920                               1.396635                   2904.4950                        417.992855\n568           7.76         24.54           47.92      181.0          0.05263           0.04362  ...                  0.07039                      178.452                                1.727371                                    inf                    383.5608                       2029.184549\n\n[569 rows x 35 columns]\n
"}, {"location": "API/feature_engineering/featuregenerator/#methods", "title": "Methods", "text": "

fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformGenerate new features.

method fit(X, y=None)[source]Fit to data.

ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

Returnsself Estimator instance.

method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

**fit_params Additional keyword arguments for the fit method.

Returnsdataframe Transformed feature set. Only returned if provided.

series or dataframe Transformed target column. Only returned if provided.

method get_params(deep=True)[source]Get parameters for this estimator.

Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

Returnsparams : dict Parameter names mapped to their values.

method inverse_transform(X=None, y=None)[source]Do nothing.

Returns the input unchanged. Implemented for continuity of the API.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

Returnsdataframe Feature set. Only returned if provided.

series or dataframe Target column. Only returned if provided.

method set_params(**params)[source]Set the parameters of this estimator.

Parameters**params : dict Estimator parameters.

Returnsself : estimator instance Estimator instance.

method transform(X, y=None)[source]Generate new features.

ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

Returnsdataframe Transformed feature set.

"}, {"location": "API/feature_engineering/featuregrouper/", "title": "FeatureGrouper", "text": "

class atom.feature_engineering.FeatureGrouper(groups, operators=None, drop_columns=True, verbose=0, logger=None)[source]Extract statistics from similar features.

Replace groups of features with related characteristics with new features that summarize statistical properties of the group. The statistical operators are calculated over every row of the group. The group names and features can be accessed through the groups method.

This class can be accessed from atom through the feature_grouping method. Read more in the user guide.

Parametersgroups: dict Group names and features. A feature can belong to multiple groups.

operators: str, sequence or None, default=None Statistical operators to apply on the groups. Any operator from numpy or scipy.stats (checked in that order) that is applied on an array can be used. If None, it uses: min, max, mean, median, mode and std.

drop_columns: bool, default=True Whether to drop the columns in groups after transformation.

verbose: int, default=0 Verbosity level of the class. Choose from:

  • 0 to not print anything.
  • 1 to print basic information.
  • 2 to print detailed information.

logger: str, Logger or None, default=None

  • If None: Logging isn't used.
  • If str: Name of the log file. Use \"auto\" for automatic naming.
  • Else: Python logging.Logger instance.

Attributesfeature_names_in_: np.ndarray Names of features seen during fit.

n_features_in_: int Number of features seen during fit.

See Also

FeatureExtractor Extract features from datetime columns.

FeatureGenerator Generate new features.

FeatureSelector Reduce the number of features in the data.

"}, {"location": "API/feature_engineering/featuregrouper/#example", "title": "Example", "text": "atomstand-alone
>>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y)\n>>> atom.feature_grouping({\"group1\": \"mean.*\"}, verbose=2)\n\nFitting FeatureGrouper...\nGrouping features...\n --> Group group1 successfully created.\n\n\n>>> print(atom.dataset)\n\n     radius error  texture error  perimeter error  area error  smoothness error  compactness error  concavity error  concave points error  symmetry error  ...  worst symmetry  worst fractal dimension  min(group1)  max(group1)  mean(group1)  median(group1)  mode(group1)  std(group1)  target\n0          0.5190         2.9100            5.801       67.10          0.007545           0.060500         0.021340              0.018430         0.03056  ...          0.2311                  0.09203      0.07224       1132.0    130.736684        0.186400       0.07224   335.890773       0\n1          0.4564         1.0750            3.425       48.55          0.005903           0.037310         0.047300              0.015570         0.01318  ...          0.2218                  0.07820      0.05302        858.1    101.162786        0.130650       0.05302   254.320568       0\n2          0.2298         0.9988            1.534       22.18          0.002826           0.009105         0.013110              0.005174         0.01013  ...          0.2683                  0.06829      0.02847        758.6     89.400425        0.116550       0.02847   224.981976       0\n3          0.3117         0.8155            1.972       27.94          0.005217           0.015150         0.016780              0.012680         0.01669  ...          0.2723                  0.07071      0.05723        761.7     89.389875        0.138110       0.09462   226.081026       1\n4          0.3336         1.8600            2.041       19.91          0.011880           0.037470         0.045910              0.015440         0.02287  ...     
     0.2383                  0.09026      0.03068        334.2     43.414796        0.161250       0.03068    99.030712       1\n..            ...            ...              ...         ...               ...                ...              ...                   ...             ...  ...             ...                      ...          ...          ...           ...             ...           ...          ...     ...\n564        0.4727         1.2400            3.195       45.40          0.005718           0.011620         0.019980              0.011090         0.01410  ...          0.3029                  0.08216      0.05259        684.5     81.456503        0.128635       0.05259   202.924880       0\n565        0.8601         1.4800            7.029      111.70          0.008124           0.036110         0.054890              0.027650         0.03176  ...          0.2909                  0.05865      0.05024       1290.0    146.813205        0.170250       0.05024   383.094862       0\n566        0.2094         0.7636            1.231       17.67          0.008725           0.020030         0.023350              0.011320         0.02625  ...          0.3380                  0.09584      0.03370        513.7     62.632288        0.136750       0.03370   152.314252       1\n567        0.2818         0.7614            1.808       18.54          0.006142           0.006134         0.001835              0.003576         0.01637  ...          0.2738                  0.07685      0.00309        366.8     45.967364        0.109675       0.00309   108.819747       1\n568        0.2810         0.8135            3.369       23.81          0.004929           0.066570         0.076830              0.013680         0.01526  ...          0.2845                  0.12490      0.02833        542.9     66.369889        0.141200       0.02833   160.878141       1\n\n[569 rows x 27 columns]\n
>>> from atom.feature_engineering import FeatureGrouper\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, _ = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> fg = FeatureGrouper({\"group1\": [\"mean texture\", \"mean radius\"]}, verbose=2)\n>>> X = fg.transform(X)\n\nGrouping features...\n --> Group group1 successfully created.\n\n\n>>> print(X)\n\n     mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  mean fractal dimension  radius error  ...  worst concave points  worst symmetry  worst fractal dimension  min(group1)  max(group1)  mean(group1)  median(group1)  mode(group1)  std(group1)\n0            122.80     1001.0          0.11840           0.27760         0.30010              0.14710         0.2419                 0.07871        1.0950  ...                0.2654          0.4601                  0.11890        10.38        17.99        14.185          14.185         10.38        3.805\n1            132.90     1326.0          0.08474           0.07864         0.08690              0.07017         0.1812                 0.05667        0.5435  ...                0.1860          0.2750                  0.08902        17.77        20.57        19.170          19.170         17.77        1.400\n2            130.00     1203.0          0.10960           0.15990         0.19740              0.12790         0.2069                 0.05999        0.7456  ...                0.2430          0.3613                  0.08758        19.69        21.25        20.470          20.470         19.69        0.780\n3             77.58      386.1          0.14250           0.28390         0.24140              0.10520         0.2597                 0.09744        0.4956  ...                
0.2575          0.6638                  0.17300        11.42        20.38        15.900          15.900         11.42        4.480\n4            135.10     1297.0          0.10030           0.13280         0.19800              0.10430         0.1809                 0.05883        0.7572  ...                0.1625          0.2364                  0.07678        14.34        20.29        17.315          17.315         14.34        2.975\n..              ...        ...              ...               ...             ...                  ...            ...                     ...           ...  ...                   ...             ...                      ...          ...          ...           ...             ...           ...          ...\n564          142.00     1479.0          0.11100           0.11590         0.24390              0.13890         0.1726                 0.05623        1.1760  ...                0.2216          0.2060                  0.07115        21.56        22.39        21.975          21.975         21.56        0.415\n565          131.20     1261.0          0.09780           0.10340         0.14400              0.09791         0.1752                 0.05533        0.7655  ...                0.1628          0.2572                  0.06637        20.13        28.25        24.190          24.190         20.13        4.060\n566          108.30      858.1          0.08455           0.10230         0.09251              0.05302         0.1590                 0.05648        0.4564  ...                0.1418          0.2218                  0.07820        16.60        28.08        22.340          22.340         16.60        5.740\n567          140.10     1265.0          0.11780           0.27700         0.35140              0.15200         0.2397                 0.07016        0.7260  ...                
0.2650          0.4087                  0.12400        20.60        29.33        24.965          24.965         20.60        4.365\n568           47.92      181.0          0.05263           0.04362         0.00000              0.00000         0.1587                 0.05884        0.3857  ...                0.0000          0.2871                  0.07039         7.76        24.54        16.150          16.150          7.76        8.390\n\n[569 rows x 34 columns]\n
"}, {"location": "API/feature_engineering/featuregrouper/#methods", "title": "Methods", "text": "

fitDo nothing.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformGroup features.

method fit(X=None, y=None, **fit_params)[source]Do nothing.

Implemented for continuity of the API.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

**fit_params Additional keyword arguments for the fit method.

Returnsself Estimator instance.

method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

**fit_params Additional keyword arguments for the fit method.

Returnsdataframe Transformed feature set. Only returned if provided.

series or dataframe Transformed target column. Only returned if provided.

method get_params(deep=True)[source]Get parameters for this estimator.

Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

Returnsparams : dict Parameter names mapped to their values.

method inverse_transform(X=None, y=None)[source]Do nothing.

Returns the input unchanged. Implemented for continuity of the API.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

Returnsdataframe Feature set. Only returned if provided.

series or dataframe Target column. Only returned if provided.

method set_params(**params)[source]Set the parameters of this estimator.

Parameters**params : dict Estimator parameters.

Returnsself : estimator instance Estimator instance.

method transform(X, y=None)[source]Group features.

ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

Returnsdataframe Transformed feature set.

"}, {"location": "API/feature_engineering/featureselector/", "title": "FeatureSelector", "text": "

class atom.feature_engineering.FeatureSelector(strategy=None, solver=None, n_features=None, min_repeated=2, max_repeated=1.0, max_correlation=1.0, n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", verbose=0, logger=None, random_state=None, **kwargs)[source]Reduce the number of features in the data.

Apply feature selection or dimensionality reduction, either to improve the estimators' accuracy or to boost their performance on very high-dimensional datasets. Additionally, remove multicollinear and low-variance features.

This class can be accessed from atom through the feature_selection method. Read more in the user guide.

Warning

  • Ties between features with equal scores are broken in an unspecified way.
  • For strategy=\"rfecv\", the n_features parameter is the minimum number of features to select, not the actual number of features that the transformer returns. It may very well be that it returns more!

Info

  • The \"sklearnex\" and \"cuml\" engines are only supported for strategy=\"pca\" with dense datasets.
  • If strategy=\"pca\" and the data is dense and unscaled, it's scaled to mean=0 and std=1 before fitting the PCA transformer.
  • If strategy=\"pca\" and the provided data is sparse, the used estimator is TruncatedSVD, which works more efficiently with sparse matrices.

Tip

  • Use the plot_pca and plot_components methods to examine the results after using strategy=\"pca\".
  • Use the plot_rfecv method to examine the results after using strategy=\"rfecv\".
  • Use the plot_feature_importance method to examine how much a specific feature contributes to the final predictions. If the model doesn't have a feature_importances_ attribute, use plot_permutation_importance instead.

Parametersstrategy: str or None, default=None Feature selection strategy to use. Choose from:

  • None: Do not perform any feature selection strategy.
  • \"univariate\": Univariate statistical F-test.
  • \"pca\": Principal Component Analysis.
  • \"sfm\": Select best features according to a model.
  • \"sfs\": Sequential Feature Selection.
  • \"rfe\": Recursive Feature Elimination.
  • \"rfecv\": RFE with cross-validated selection.
  • \"pso\": Particle Swarm Optimization.
  • \"hho\": Harris Hawks Optimization.
  • \"gwo\": Grey Wolf Optimization.
  • \"dfo\": Dragonfly Optimization.
  • \"go\": Genetic Optimization.

solver: str, func, estimator or None, default=None Solver/estimator to use for the feature selection strategy. See the corresponding documentation for an extended description of the choices. If None, the default value is used (only if strategy=\"pca\"). Choose from:

  • If strategy=\"univariate\":

    • \"f_classif\"
    • \"f_regression\"
    • \"mutual_info_classif\"
    • \"mutual_info_regression\"
    • \"chi2\"
    • Any function with signature func(X, y) -> tuple[scores, p-values].
  • If strategy=\"pca\":

    • If data is dense:

      • If engine=\"sklearn\":

        • \"auto\" (default)
        • \"full\"
        • \"arpack\"
        • \"randomized\"
      • If engine=\"sklearnex\":

        • \"full\" (default)
      • If engine=\"cuml\":

        • \"full\" (default)
        • \"jacobi\"
    • If data is sparse:

      • \"randomized\" (default)
      • \"arpack\"
  • For the remaining strategies: The base estimator. For sfm, rfe and rfecv, it should have either a feature_importances_ or coef_ attribute after fitting. You can use one of the predefined models. Add _class or _reg after the model's name to specify a classification or regression task, e.g., solver=\"LGB_reg\" (not necessary if called from atom). No default option.

n_features: int, float or None, default=None Number of features to select.

  • If None: Select all features.
  • If <1: Fraction of the total features to select.
  • If >=1: Number of features to select.

If strategy=\"sfm\" and the threshold parameter is not specified, the threshold is automatically set to -inf to select n_features number of features.

If strategy=\"rfecv\", n_features is the minimum number of features to select.

This parameter is ignored if any of the following strategies is selected: pso, hho, gwo, dfo, go.

min_repeated: int, float or None, default=2 Remove categorical features if there isn't any repeated value in at least min_repeated rows. The default is to keep all features with non-maximum variance, i.e., remove the features whose number of unique values is equal to the number of rows (usually the case for names, IDs, etc...).

  • If None: No check for minimum repetition.
  • If >1: Minimum repetition number.
  • If <=1: Minimum repetition fraction.

max_repeated: int, float or None, default=1.0 Remove categorical features with the same value in at least max_repeated rows. The default is to keep all features with non-zero variance, i.e., remove the features that have the same value in all samples.

  • If None: No check for maximum repetition.
  • If >1: Maximum number of repeated occurrences.
  • If <=1: Maximum fraction of repeated occurrences.

max_correlation: float or None, default=1.0 Minimum absolute Pearson correlation to identify correlated features. For each group, it removes all except the feature with the highest correlation to y (if provided, else it removes all but the first). The default value removes equal columns. If None, skip this step.

n_jobs: int, default=1 Number of cores to use for parallel processing.

  • If >0: Number of cores to use.
  • If -1: Use all available cores.
  • If <-1: Use number of cores - 1 + n_jobs.

device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

  • \"data\":

    • \"numpy\"
    • \"pyarrow\"
    • \"modin\"
  • \"estimator\":

    • \"sklearn\"
    • \"sklearnex\"
    • \"cuml\"

backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

  • \"loky\": Single-node, process-based parallelism.
  • \"multiprocessing\": Legacy single-node, process-based parallelism. Less robust than loky.
  • \"threading\": Single-node, thread-based parallelism.
  • \"ray\": Multi-node, process-based parallelism.

verbose: int, default=0 Verbosity level of the class. Choose from:

  • 0 to not print anything.
  • 1 to print basic information.
  • 2 to print detailed information.

logger: str, Logger or None, default=None

  • If None: Logging isn't used.
  • If str: Name of the log file. Use \"auto\" for automatic naming.
  • Else: Python logging.Logger instance.

random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

**kwargs Any extra keyword argument for the strategy estimator. See the corresponding documentation for the available options.

Attributescollinear_: pd.DataFrame Information on the removed collinear features. Columns include:

  • drop: Name of the dropped feature.
  • corr_feature: Names of the correlated features.
  • corr_value: Corresponding correlation coefficients.

[strategy]_: sklearn transformer Object used to transform the data, e.g., fs.pca_ for the pca strategy.

feature_names_in_: np.ndarray Names of features seen during fit.

n_features_in_: int Number of features seen during fit.

See Also

FeatureExtractor Extract features from datetime columns.

FeatureGenerator Generate new features.

FeatureGrouper Extract statistics from similar features.

"}, {"location": "API/feature_engineering/featureselector/#example", "title": "Example", "text": "atomstand-alone
>>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y)\n>>> atom.feature_selection(strategy=\"pca\", n_features=12, verbose=2)\n\nFitting FeatureSelector...\nPerforming feature selection ...\n --> Applying Principal Component Analysis...\n   --> Scaling features...\n   --> Keeping 12 components.\n   --> Explained variance ratio: 0.971\n\n\n>>> # Note that the column names changed\n>>> print(atom.dataset)\n\n         pca0      pca1      pca2      pca3      pca4      pca5      pca6      pca7      pca8      pca9     pca10     pca11  target\n0    1.933532  2.215152  1.268851 -1.776239  0.069615 -0.043647  0.281363  0.122942 -0.911086 -0.223754 -0.086316 -0.929486       1\n1    1.203025  6.706587  4.445104  0.087116  3.044271 -1.130720  0.820790 -0.593311 -1.004105  0.945411 -0.199241  0.948766       1\n2    4.506063 -1.419715 -1.216228  1.189962  0.227850  0.788522 -0.829805  0.521853 -0.381054  0.676945  0.004564  0.066630       0\n3   -2.179059  0.496110 -0.870279 -0.151235 -0.715354  0.983901 -0.232186  0.449653  0.350218  0.644448  0.280308 -0.544707       1\n4    0.708048  0.859536 -2.683579  0.295765  0.712158 -1.105250 -0.226270 -0.264257  0.494656 -0.643629 -0.152528 -0.008835       0\n..        ...       ...       ...       ...       ...       ...       ...       ...       ...       ...       ...       ...     
...\n564 -2.477152 -1.482251 -0.389774 -0.333742  0.627651 -0.475717 -0.048757 -0.337669  0.382336  0.132000  0.204445  0.118625       1\n565 -0.400165  0.078366 -2.082886 -1.024593  0.623709 -1.003931  0.571384  0.248557 -0.489957 -0.397008 -0.132552 -0.162104       0\n566 -2.956303 -0.111232 -0.770455  0.035805  0.308638  0.311849  0.119611 -0.994997  0.495694 -0.130586  0.214798  0.358027       1\n567 -5.409548 -0.784989  1.540835  2.205277  0.249963  1.552586  1.837439 -0.796343  0.508352  0.011600 -0.066693 -0.006518       1\n568 -3.648393 -1.340745  0.503077  4.546174 -0.221396  1.229170  0.687803  0.711380  0.527799  0.139843 -0.958308  0.834252       1\n\n[569 rows x 13 columns]\n
>>> from atom.feature_engineering import FeatureSelector\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, _ = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> fs = FeatureSelector(strategy=\"pca\", n_features=12, verbose=2)\n>>> X = fs.fit_transform(X)\n\nFitting FeatureSelector...\nPerforming feature selection ...\n --> Applying Principal Component Analysis...\n   --> Scaling features...\n   --> Keeping 12 components.\n   --> Explained variance ratio: 0.97\n\n\n>>> # Note that the column names changed\n>>> print(X)\n\n          pca0       pca1      pca2      pca3      pca4      pca5      pca6      pca7      pca8      pca9     pca10     pca11\n0     9.192837   1.948583 -1.123166  3.633731 -1.195110  1.411424  2.159370 -0.398407 -0.157118 -0.877402  0.262955 -0.859014\n1     2.387802  -3.768172 -0.529293  1.118264  0.621775  0.028656  0.013358  0.240988 -0.711905  1.106995  0.813120  0.157923\n2     5.733896  -1.075174 -0.551748  0.912083 -0.177086  0.541452 -0.668166  0.097374  0.024066  0.454275 -0.605604  0.124387\n3     7.122953  10.275589 -3.232790  0.152547 -2.960878  3.053422  1.429911  1.059565 -1.405440 -1.116975 -1.151514  1.011316\n4     3.935302  -1.948072  1.389767  2.940639  0.546747 -1.226495 -0.936213  0.636376 -0.263805  0.377704  0.651360 -0.110515\n..         ...        ...       ...       ...       ...       ...       ...       ...       ...       ...       ...       
...\n564   6.439315  -3.576817  2.459487  1.177314 -0.074824 -2.375193 -0.596130 -0.035471  0.987929  0.256989 -0.062651  0.123342\n565   3.793382  -3.584048  2.088476 -2.506028 -0.510723 -0.246710 -0.716326 -1.113360 -0.105207 -0.108632  0.244804  0.222753\n566   1.256179  -1.902297  0.562731 -2.089227  1.809991 -0.534447 -0.192758  0.341887  0.393917  0.520877 -0.840512  0.096473\n567  10.374794   1.672010 -1.877029 -2.356031 -0.033742  0.567936  0.223082 -0.280239 -0.542035 -0.089296 -0.178628 -0.697461\n568  -5.475243  -0.670637  1.490443 -2.299157 -0.184703  1.617837  1.698952  1.046354  0.374101 -0.047726 -0.144094 -0.179496\n\n[569 rows x 12 columns]\n
"}, {"location": "API/feature_engineering/featureselector/#methods", "title": "Methods", "text": "

fitFit the feature selector to the data.fit_transformFit to data, then transform it.get_metadata_routingGet metadata routing of this object.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformTransform the data.

method fit(X, y=None)[source]Fit the feature selector to the data.

The univariate, sfm (when model is not fitted), sfs, rfe and rfecv strategies need a target column. Leaving it None raises an exception.

ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

Returnsself Estimator instance.

method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

**fit_params Additional keyword arguments for the fit method.

Returnsdataframe Transformed feature set. Only returned if provided.

series or dataframe Transformed target column. Only returned if provided.

method get_metadata_routing()[source]Get metadata routing of this object.

Returnsrouting : MetadataRequest A :class:~sklearn.utils.metadata_routing.MetadataRequest encapsulating routing information.

method get_params(deep=True)[source]Get parameters for this estimator.

Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

Returnsparams : dict Parameter names mapped to their values.

method inverse_transform(X=None, y=None)[source]Do nothing.

Returns the input unchanged. Implemented for continuity of the API.

ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

  • If None: y is ignored.
  • If int: Position of the target column in X.
  • If str: Name of the target column in X.
  • If dict: Name of the target column and sequence of values.
  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

Returnsdataframe Feature set. Only returned if provided.

series or dataframe Target column. Only returned if provided.

method set_params(**params)[source]Set the parameters of this estimator.

Parameters**params : dict Estimator parameters.

Returnsself : estimator instance Estimator instance.

method transform(X, y=None)[source]Transform the data.

ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

Returnsdataframe Transformed feature set.

"}, {"location": "API/models/adab/", "title": "AdaBoost", "text": "

AdaB accept sparse

AdaBoost is a meta-estimator that begins by fitting a classifier/regressor on the original dataset and then fits additional copies of the algorithm on the same dataset but where the weights of instances are adjusted according to the error of the current prediction.

Corresponding estimators are:

  • AdaBoostClassifier for classification tasks.
  • AdaBoostRegressor for regression tasks.

Read more in sklearn's documentation.

See Also

GradientBoostingMachine Gradient Boosting Machine.

RandomForest Random Forest.

XGBoost Extreme Gradient Boosting.

"}, {"location": "API/models/adab/#example", "title": "Example", "text": "
>>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"AdaB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: AdaB\nMetric: f1\n\n\nResults for AdaBoost:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9583\nTime elapsed: 0.221s\n-------------------------------------------------\nTime: 0.221s\n\n\nFinal results ==================== >>\nTotal time: 0.224s\n-------------------------------------\nAdaBoost --> f1: 0.9583\n
"}, {"location": "API/models/adab/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

Parametersn_estimatorsIntDistribution(high=500, log=False, low=50, step=10)learning_rateFloatDistribution(high=10.0, log=True, low=0.01, step=None)algorithmCategoricalDistribution(choices=('SAMME.R', 'SAMME'))

Parametersn_estimatorsIntDistribution(high=500, log=False, low=50, step=10)learning_rateFloatDistribution(high=10.0, log=True, low=0.01, step=None)lossCategoricalDistribution(choices=('linear', 'square', 'exponential'))

"}, {"location": "API/models/adab/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/adab/#data-attributes", "title": "Data attributes", "text": "

Attributespipeline: PipelinePipeline of transforms.

Models that used automated feature scaling have the scaler added.

Tip

Use the plot_pipeline method to visualize the pipeline.

mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

"}, {"location": "API/models/adab/#utility-attributes", "title": "Utility attributes", "text": "

Attributesname: strName of the model.

Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

  • [param_name]: Parameter value used in this trial.
  • estimator: Estimator used in this trial.
  • [metric_name]: Metric score of the trial.
  • [best_metric_name]: Best score so far in this study.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

    For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

    This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

    All durations are in seconds. Possible values include:

    • [metric]_ht: Score obtained by the hyperparameter tuning.
    • time_ht: Duration of the hyperparameter tuning.
    • [metric]_train: Metric score on the train set.
    • [metric]_test: Metric score on the test set.
    • time_fit: Duration of the model fitting on the train set.
    • [metric]_bootstrap: Mean score on the bootstrapped samples.
    • time_bootstrap: Duration of the bootstrapping.
    • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

      The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

      "}, {"location": "API/models/adab/#methods", "title": "Methods", "text": "

      The plots can be called directly from the model. The remaining utility methods can be found hereunder.

      bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

      method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

      Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

      Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

      reset: bool, default=False Whether to start a new run or continue the existing one.

      method calibrate(**kwargs)[source]Calibrate the model.

      Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

      Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

      method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

      This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

      Parametersrows: int, default=1 Number of plots in length.

      cols: int, default=2 Number of plots in width.

      horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

      vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

      title: str, dict or None, default=None Title for the plot.

      • If None, no title is shown.
      • If str, text for the title.
      • If dict, title configuration.

      legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

      • If None: No legend is shown.
      • If str: Location where to show the legend.
      • If dict: Legend configuration.

      figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

      filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

      display: bool, default=True Whether to render the plot.

      Yieldsgo.Figure Plot object.

      method clear()[source]Reset attributes and clear cache from the model.

      Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

      • In-training validation scores
      • Cached predictions.
      • Shap values
      • App instance
      • Dashboard instance
      • Calculated holdout data sets

      method create_app(**kwargs)[source]Create an interactive app to test model predictions.

      Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

      Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

      method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

      ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

      By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

      Note

      Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

      Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

      filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

      **kwargs Additional keyword arguments for the ExplainerDashboard instance.

      method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

      This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

      Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

      Returnspd.DataFrame Overview of the results.

      method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

      Read more in the user guide.

      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

      Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

      method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

      Tip

      Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

      Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

      rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

      threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

      • The task is binary or multilabel classification.
      • The model has a predict_proba method.
      • The metric evaluates predicted probabilities.

      For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

      sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

      Returnspd.Series Scores of the model.

      method export_pipeline()[source]Export the transformer pipeline with final estimator.

      The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

      ReturnsPipeline Current branch as a sklearn-like Pipeline object.

      method fit(X=None, y=None)[source]Fit and validate the model.

      The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

      ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

      y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

      method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

      In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

      Warning

      Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

      Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

      method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

      Only available for models with a predict_proba method in a binary or multilabel classification task.

      Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

      Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

      method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

      Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

      Parametersn_trials: int Number of trials for the hyperparameter tuning.

      reset: bool, default=False Whether to start a new study or continue the existing one.

      method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

      Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

      ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

      • If None: y is ignored.
      • If int: Position of the target column in X.
      • If str: Name of the target column in X.
      • If dict: Name of the target column and sequence of values.
      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
      • If dataframe: Target columns for multioutput tasks.

      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

      Returnsdataframe Original feature set. Only returned if provided.

      series or dataframe Original target column. Only returned if provided.

      method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

      Read more in the user guide.

      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

      Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

      method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

      Read more in the user guide.

      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

      Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

      method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

      Read more in the user guide.

      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

      Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

      method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

      This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

      Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

      stage: str, default=\"None\" New desired stage for the model.

      archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

      method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

      method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

      Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

      method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

      Read more in the user guide.

      Info

      If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

      • If None: X must be a selection of rows in the dataset.
      • If int: Position of the target column in X.
      • If str: Name of the target column in X.
      • If dict: Name of the target column and sequence of values.
      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
      • If dataframe: Target columns for multioutput tasks.

      metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

      sample_weight: sequence or None, default=None Sample weights corresponding to y.

      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

      Returnsfloat Metric score of X with respect to y.

      method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

      The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

      Tip

      Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

      Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

      host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

      port: int, default=8000 Port for HTTP server.

      method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

      Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

      ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

      • If None: y is ignored.
      • If int: Position of the target column in X.
      • If str: Name of the target column in X.
      • If dict: Name of the target column and sequence of values.
      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
      • If dataframe: Target columns for multioutput tasks.

      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

      Returnsdataframe Transformed feature set. Only returned if provided.

      series or dataframe Transformed target column. Only returned if provided.

      method update_layout(**kwargs)[source]Update the properties of the plot's layout.

      Recursively update the structure of the original layout with the values in the arguments.

      Parameters**kwargs Keyword arguments for the figure's update_layout method.

      method update_traces(**kwargs)[source]Update the properties of the plot's traces.

      Recursively update the structure of the original traces with the values in the arguments.

      Parameters**kwargs Keyword arguments for the figure's update_traces method.

      "}, {"location": "API/models/ard/", "title": "AutomaticRelevanceDetermination", "text": "

      ARD needs scaling

      Automatic Relevance Determination is very similar to BayesianRidge, but can lead to sparser coefficients. Fit the weights of a regression model, using an ARD prior. The weights of the regression model are assumed to be in Gaussian distributions.

      Corresponding estimators are:

      • ARDRegression for regression tasks.

      Read more in sklearn's documentation.

      See Also

      BayesianRidge Bayesian ridge regression.

      GaussianProcess Gaussian process.

      LeastAngleRegression Least Angle Regression.

      "}, {"location": "API/models/ard/#example", "title": "Example", "text": "
      >>> from atom import ATOMRegressor\n>>> from sklearn.datasets import fetch_california_housing\n\n>>> X, y = fetch_california_housing(return_X_y=True)\n\n>>> atom = ATOMRegressor(X, y, random_state=1)\n>>> atom.run(models=\"ARD\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= >>\nModels: ARD\nMetric: r2\n\n\nResults for AutomaticRelevanceDetermination:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.6067\nTest evaluation --> r2: 0.6029\nTime elapsed: 0.139s\n-------------------------------------------------\nTime: 0.139s\n\n\nFinal results ==================== >>\nTotal time: 0.140s\n-------------------------------------\nAutomaticRelevanceDetermination --> r2: 0.6029\n
      "}, {"location": "API/models/ard/#hyperparameters", "title": "Hyperparameters", "text": "

      Parametersn_iterIntDistribution(high=1000, log=False, low=100, step=10)alpha_1FloatDistribution(high=1.0, log=True, low=0.0001, step=None)alpha_2FloatDistribution(high=1.0, log=True, low=0.0001, step=None)lambda_1FloatDistribution(high=1.0, log=True, low=0.0001, step=None)lambda_2FloatDistribution(high=1.0, log=True, low=0.0001, step=None)

      "}, {"location": "API/models/ard/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/ard/#data-attributes", "title": "Data attributes", "text": "

      Attributespipeline: PipelinePipeline of transforms.

      Models that used automated feature scaling have the scaler added.

      Tip

      Use the plot_pipeline method to visualize the pipeline.

      mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

      The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

      "}, {"location": "API/models/ard/#utility-attributes", "title": "Utility attributes", "text": "

      Attributesname: strName of the model.

      Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

      This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

      This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

      This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

      • [param_name]: Parameter value used in this trial.
      • estimator: Estimator used in this trial.
      • [metric_name]: Metric score of the trial.
      • [best_metric_name]: Best score so far in this study.
      • time_trial: Duration of the trial.
      • time_ht: Duration of the hyperparameter tuning.
      • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

        For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

        This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

        The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

        All durations are in seconds. Possible values include:

        • [metric]_ht: Score obtained by the hyperparameter tuning.
        • time_ht: Duration of the hyperparameter tuning.
        • [metric]_train: Metric score on the train set.
        • [metric]_test: Metric score on the test set.
        • time_fit: Duration of the model fitting on the train set.
        • [metric]_bootstrap: Mean score on the bootstrapped samples.
        • time_bootstrap: Duration of the bootstrapping.
        • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

          The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

          "}, {"location": "API/models/ard/#methods", "title": "Methods", "text": "

          The plots can be called directly from the model. The remaining utility methods can be found hereunder.

          bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

          method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

          Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

          Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

          reset: bool, default=False Whether to start a new run or continue the existing one.

          method calibrate(**kwargs)[source]Calibrate the model.

          Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

          Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

          This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

          Parametersrows: int, default=1 Number of plots in length.

          cols: int, default=2 Number of plots in width.

          horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

          vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

          title: str, dict or None, default=None Title for the plot.

          • If None, no title is shown.
          • If str, text for the title.
          • If dict, title configuration.

          legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

          • If None: No legend is shown.
          • If str: Location where to show the legend.
          • If dict: Legend configuration.

          figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

          filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

          display: bool, default=True Whether to render the plot.

          Yieldsgo.Figure Plot object.

          method clear()[source]Reset attributes and clear cache from the model.

          Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

          • In-training validation scores
          • Cached predictions.
          • Shap values
          • App instance
          • Dashboard instance
          • Calculated holdout data sets

          method create_app(**kwargs)[source]Create an interactive app to test model predictions.

          Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

          Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

          method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

          ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

          By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

          Note

          Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

          Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

          filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

          **kwargs Additional keyword arguments for the ExplainerDashboard instance.

          method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

          This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

          Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

          Returnspd.DataFrame Overview of the results.

          method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

          Read more in the user guide.

          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

          Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

          method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

          Tip

          Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

          Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

          rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

          threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

          • The task is binary or multilabel classification.
          • The model has a predict_proba method.
          • The metric evaluates predicted probabilities.

          For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

          sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

          Returnspd.Series Scores of the model.

          method export_pipeline()[source]Export the transformer pipeline with final estimator.

          The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

          ReturnsPipeline Current branch as a sklearn-like Pipeline object.

          method fit(X=None, y=None)[source]Fit and validate the model.

          The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

          ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

          y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

          method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

          In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

          Warning

          Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

          Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

          method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

          Only available for models with a predict_proba method in a binary or multilabel classification task.

          Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

          Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

          method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

          Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

          Parametersn_trials: int Number of trials for the hyperparameter tuning.

          reset: bool, default=False Whether to start a new study or continue the existing one.

          method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

          Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

          ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

          • If None: y is ignored.
          • If int: Position of the target column in X.
          • If str: Name of the target column in X.
          • If dict: Name of the target column and sequence of values.
          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
          • If dataframe: Target columns for multioutput tasks.

          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

          Returnsdataframe Original feature set. Only returned if provided.

          series or dataframe Original target column. Only returned if provided.

          method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

          Read more in the user guide.

          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

          Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

          method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

          Read more in the user guide.

          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

          Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

          method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

          Read more in the user guide.

          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

          Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

          method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

          This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

          Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

          stage: str, default=\"None\" New desired stage for the model.

          archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

          method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

          method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

          Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

          method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

          Read more in the user guide.

          Info

          If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

          • If None: X must be a selection of rows in the dataset.
          • If int: Position of the target column in X.
          • If str: Name of the target column in X.
          • If dict: Name of the target column and sequence of values.
          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
          • If dataframe: Target columns for multioutput tasks.

          metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

          sample_weight: sequence or None, default=None Sample weights corresponding to y.

          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

          Returnsfloat Metric score of X with respect to y.

          method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

          The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

          Tip

          Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

          Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

          host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

          port: int, default=8000 Port for HTTP server.

          method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

          Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

          ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

          • If None: y is ignored.
          • If int: Position of the target column in X.
          • If str: Name of the target column in X.
          • If dict: Name of the target column and sequence of values.
          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
          • If dataframe: Target columns for multioutput tasks.

          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

          Returnsdataframe Transformed feature set. Only returned if provided.

          series or dataframe Transformed target column. Only returned if provided.

          method update_layout(**kwargs)[source]Update the properties of the plot's layout.

          Recursively update the structure of the original layout with the values in the arguments.

          Parameters**kwargs Keyword arguments for the figure's update_layout method.

          method update_traces(**kwargs)[source]Update the properties of the plot's traces.

          Recursively update the structure of the original traces with the values in the arguments.

          Parameters**kwargs Keyword arguments for the figure's update_traces method.

          "}, {"location": "API/models/arima/", "title": "ARIMA", "text": "

          ARIMA native multioutput

          Seasonal ARIMA models and exogenous input are supported, hence this estimator is capable of fitting SARIMA, ARIMAX, and SARIMAX.

          An ARIMA model is a generalization of an autoregressive moving average (ARMA) model and is fitted to time-series data in an effort to forecast future points. ARIMA models can be especially efficacious in cases where data shows evidence of non-stationarity.

          The \"AR\" part of ARIMA indicates that the evolving variable of interest is regressed on its own lagged (i.e., prior observed) values. The \"MA\" part indicates that the regression error is actually a linear combination of error terms whose values occurred contemporaneously and at various times in the past. The \"I\" (for \"integrated\") indicates that the data values have been replaced with the difference between their values and the previous values (and this differencing process may have been performed more than once).

          Corresponding estimators are:

          • ARIMA for forecasting tasks.

          Warning

          ARIMA often runs into numerical errors when optimizing the hyperparameters. Possible solutions are:

          • Use the AutoARIMA model instead.
          • Use est_params to specify the orders manually, e.g., atom.run(\"arima\", n_trials=5, est_params={\"order\": (1, 1, 0)}).
          • Use the catch parameter in ht_params to avoid raising every exception, e.g., atom.run(\"arima\", n_trials=5, ht_params={\"catch\": (Exception,)}).

          See Also

          AutoARIMA Automatic Autoregressive Integrated Moving Average Model.

          "}, {"location": "API/models/arima/#example", "title": "Example", "text": "
          >>> from atom import ATOMForecaster\n>>> from sktime.datasets import load_longley\n\n>>> _, X = load_longley()\n\n>>> atom = ATOMForecaster(X)\n>>> atom.run(models=\"ARIMA\", verbose=2)\n\n\nTraining ========================= >>\nModels: ARIMA\nMetric: mape\n\n\nResults for ARIMA:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0131\nTest evaluation --> mape: -0.0364\nTime elapsed: 0.214s\n-------------------------------------------------\nTime: 0.214s\n\n\nFinal results ==================== >>\nTotal time: 0.215s\n-------------------------------------\nARIMA --> mape: -0.0364\n
          "}, {"location": "API/models/arima/#hyperparameters", "title": "Hyperparameters", "text": "

          ParameterspIntDistribution(high=2, log=False, low=0, step=1)dIntDistribution(high=1, log=False, low=0, step=1)qIntDistribution(high=2, log=False, low=0, step=1)PIntDistribution(high=2, log=False, low=0, step=1)DIntDistribution(high=1, log=False, low=0, step=1)QIntDistribution(high=2, log=False, low=0, step=1)SCategoricalDistribution(choices=(0, 4, 6, 7, 12))methodCategoricalDistribution(choices=('newton', 'nm', 'bfgs', 'lbfgs', 'powell', 'cg', 'ncg', 'basinhopping'))maxiterIntDistribution(high=200, log=False, low=50, step=10)with_interceptCategoricalDistribution(choices=(True, False))

          "}, {"location": "API/models/arima/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/arima/#data-attributes", "title": "Data attributes", "text": "

          Attributespipeline: PipelinePipeline of transforms.

          Models that used automated feature scaling have the scaler added.

          Tip

          Use the plot_pipeline method to visualize the pipeline.

          mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

          The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

          "}, {"location": "API/models/arima/#utility-attributes", "title": "Utility attributes", "text": "

          Attributesname: strName of the model.

          Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

          This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

          This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

          This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

          • [param_name]: Parameter value used in this trial.
          • estimator: Estimator used in this trial.
          • [metric_name]: Metric score of the trial.
          • [best_metric_name]: Best score so far in this study.
          • time_trial: Duration of the trial.
          • time_ht: Duration of the hyperparameter tuning.
          • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

            For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

            This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

            The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

            All durations are in seconds. Possible values include:

            • [metric]_ht: Score obtained by the hyperparameter tuning.
            • time_ht: Duration of the hyperparameter tuning.
            • [metric]_train: Metric score on the train set.
            • [metric]_test: Metric score on the test set.
            • time_fit: Duration of the model fitting on the train set.
            • [metric]_bootstrap: Mean score on the bootstrapped samples.
            • time_bootstrap: Duration of the bootstrapping.
            • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

              The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

              "}, {"location": "API/models/arima/#methods", "title": "Methods", "text": "

              The plots can be called directly from the model. The remaining utility methods can be found hereunder.

              bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

              method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

              Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

              Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

              reset: bool, default=False Whether to start a new run or continue the existing one.

              method calibrate(**kwargs)[source]Calibrate the model.

              Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

              Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

              method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

              This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

              Parametersrows: int, default=1 Number of plots in length.

              cols: int, default=2 Number of plots in width.

              horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

              vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

              title: str, dict or None, default=None Title for the plot.

              • If None, no title is shown.
              • If str, text for the title.
              • If dict, title configuration.

              legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

              • If None: No legend is shown.
              • If str: Location where to show the legend.
              • If dict: Legend configuration.

              figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

              filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

              display: bool, default=True Whether to render the plot.

              Yieldsgo.Figure Plot object.

              method clear()[source]Reset attributes and clear cache from the model.

              Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

              • In-training validation scores
              • Cached predictions.
              • Shap values
              • App instance
              • Dashboard instance
              • Calculated holdout data sets

              method create_app(**kwargs)[source]Create an interactive app to test model predictions.

              Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

              Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

              method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

              ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

              By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

              Note

              Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

              Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

              filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

              **kwargs Additional keyword arguments for the ExplainerDashboard instance.

              method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

              This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

              Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

              Returnspd.DataFrame Overview of the results.

              method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

              Tip

              Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

              Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

              rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

              threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

              • The task is binary or multilabel classification.
              • The model has a predict_proba method.
              • The metric evaluates predicted probabilities.

              For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

              sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

              Returnspd.Series Scores of the model.

              method export_pipeline()[source]Export the transformer pipeline with final estimator.

              The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

              ReturnsPipeline Current branch as a sklearn-like Pipeline object.

              method fit(X=None, y=None)[source]Fit and validate the model.

              The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

              ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

              y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

              method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

              In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

              Warning

              Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

              Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

              method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

              Only available for models with a predict_proba method in a binary or multilabel classification task.

              Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

              Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

              method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

              Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

              Parametersn_trials: int Number of trials for the hyperparameter tuning.

              reset: bool, default=False Whether to start a new study or continue the existing one.

              method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

              Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

              ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

              • If None: y is ignored.
              • If int: Position of the target column in X.
              • If str: Name of the target column in X.
              • If dict: Name of the target column and sequence of values.
              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
              • If dataframe: Target columns for multioutput tasks.

              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

              Returnsdataframe Original feature set. Only returned if provided.

              series or dataframe Original target column. Only returned if provided.

              method predict(fh, X=None, verbose=None)[source]Get predictions on new data or existing rows.

              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

              Read more in the user guide.

              Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

              Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

              method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]Get prediction intervals on new data or existing rows.

              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_interval method.

              Read more in the user guide.

              Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

              coverage: float or sequence, default=0.9 Nominal coverage(s) of predictive interval(s).

              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

              Returnsdataframe Predictions with shape=(n_samples, 2) or shape=(n_samples, 2 * n_targets) for multivariate tasks.

              method predict_proba(fh, X=None, marginal=True, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

              Read more in the user guide.

              Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

              marginal: bool, default=True Whether returned distribution is marginal by time index.

              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

              Returnssktime.proba.Normal Predicted distribution.

              method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_quantiles method.

              Read more in the user guide.

              Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

              alpha: float or list of float, default=[0.05, 0.95] A probability or list of probabilities at which quantile forecasts are computed.

              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

              Returnsdataframe Predictions with shape=(n_samples, len(alpha)) or shape=(n_samples, len(alpha) * n_targets) for multivariate tasks.

              method predict_residuals(y, X=None, verbose=None)[source]Get residuals of forecasts on new data or existing rows.

              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_residuals method.

              Read more in the user guide.

              Parametersy: int, str, dict, sequence or dataframe Ground truth observations.

              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to y.

              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

              Returnsseries or dataframe Residuals with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

              method predict_var(fh, X=None, cov=False, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_var method.

              Read more in the user guide.

              Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

              cov: bool, default=False Whether to compute covariance matrix forecast or marginal variance forecasts.

              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

              Returnsdataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

              method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

              This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

              Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

              stage: str, default=\"None\" New desired stage for the model.

              archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

              method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

              method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

              Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

              method score(y, X=None, fh=None, metric=None, verbose=None)[source]Get a metric score on new data.

              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

              Read more in the user guide.

              Info

              If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sktime's score method for estimators.

              Parametersy: int, str, dict, sequence or dataframe Ground truth observations.

              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

              fh: int, sequence or ForecastingHorizon or None, default=None The forecasting horizon encoding the time stamps to forecast at.

              metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

              Returnsfloat Metric score of y with respect to a ground truth.

              method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

              The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

              Tip

              Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

              Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

              host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

              port: int, default=8000 Port for HTTP server.

              method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

              Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

              ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

              • If None: y is ignored.
              • If int: Position of the target column in X.
              • If str: Name of the target column in X.
              • If dict: Name of the target column and sequence of values.
              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
              • If dataframe: Target columns for multioutput tasks.

              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

              Returnsdataframe Transformed feature set. Only returned if provided.

              series or dataframe Transformed target column. Only returned if provided.

              method update_layout(**kwargs)[source]Update the properties of the plot's layout.

              Recursively update the structure of the original layout with the values in the arguments.

              Parameters**kwargs Keyword arguments for the figure's update_layout method.

              method update_traces(**kwargs)[source]Update the properties of the plot's traces.

              Recursively update the structure of the original traces with the values in the arguments.

              Parameters**kwargs Keyword arguments for the figure's update_traces method.

              "}, {"location": "API/models/autoarima/", "title": "AutoARIMA", "text": "

              AutoARIMA native multioutput

              ARIMA implementation that includes automated fitting of (S)ARIMA(X) hyperparameters (p, d, q, P, D, Q). The AutoARIMA algorithm seeks to identify the optimal parameters for an ARIMA model, settling on a single fitted ARIMA model. This process is based on the commonly-used R function.

              AutoARIMA works by conducting differencing tests (i.e., Kwiatkowski\u2013Phillips\u2013Schmidt\u2013Shin, Augmented Dickey-Fuller or Phillips\u2013Perron) to determine the order of differencing, d, and then fitting models within defined ranges. AutoARIMA also seeks to identify the optimal P and Q hyperparameters after conducting the Canova-Hansen test to determine the optimal order of seasonal differencing.

              Note that due to stationarity issues, AutoARIMA might not find a suitable model that will converge. If this is the case, a ValueError is thrown suggesting stationarity-inducing measures be taken prior to re-fitting or that a new range of order values be selected.

              Corresponding estimators are:

              • AutoARIMA for forecasting tasks.

              See Also

              ARIMA Autoregressive Integrated Moving Average Model.

              ETS ETS model with automatic fitting capabilities.

              "}, {"location": "API/models/autoarima/#example", "title": "Example", "text": "
              >>> from atom import ATOMForecaster\n>>> from sktime.datasets import load_longley\n\n>>> _, X = load_longley()\n\n>>> atom = ATOMForecaster(X, random_state=1)\n>>> atom.run(models=\"autoarima\", verbose=2)\n\n\nTraining ========================= >>\nModels: AutoARIMA\nMetric: mape\n\n\nResults for AutoARIMA:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0131\nTest evaluation --> mape: -0.0359\nTime elapsed: 0.437s\n-------------------------------------------------\nTime: 0.437s\n\n\nFinal results ==================== >>\nTotal time: 0.438s\n-------------------------------------\nAutoARIMA --> mape: -0.0359\n
              "}, {"location": "API/models/autoarima/#hyperparameters", "title": "Hyperparameters", "text": "

              ParametersmethodCategoricalDistribution(choices=('newton', 'nm', 'bfgs', 'lbfgs', 'powell', 'cg', 'ncg', 'basinhopping'))maxiterIntDistribution(high=200, log=False, low=50, step=10)with_interceptCategoricalDistribution(choices=(True, False))

              "}, {"location": "API/models/autoarima/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/autoarima/#data-attributes", "title": "Data attributes", "text": "

              Attributespipeline: PipelinePipeline of transforms.

              Models that used automated feature scaling have the scaler added.

              Tip

              Use the plot_pipeline method to visualize the pipeline.

              mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

              The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

              "}, {"location": "API/models/autoarima/#utility-attributes", "title": "Utility attributes", "text": "

              Attributesname: strName of the model.

              Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

              This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

              This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

              This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

              • [param_name]: Parameter value used in this trial.
              • estimator: Estimator used in this trial.
              • [metric_name]: Metric score of the trial.
              • [best_metric_name]: Best score so far in this study.
              • time_trial: Duration of the trial.
              • time_ht: Duration of the hyperparameter tuning.
              • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                All durations are in seconds. Possible values include:

                • [metric]_ht: Score obtained by the hyperparameter tuning.
                • time_ht: Duration of the hyperparameter tuning.
                • [metric]_train: Metric score on the train set.
                • [metric]_test: Metric score on the test set.
                • time_fit: Duration of the model fitting on the train set.
                • [metric]_bootstrap: Mean score on the bootstrapped samples.
                • time_bootstrap: Duration of the bootstrapping.
                • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                  The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                  "}, {"location": "API/models/autoarima/#methods", "title": "Methods", "text": "

                  The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                  bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                  method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                  Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                  Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                  reset: bool, default=False Whether to start a new run or continue the existing one.

                  method calibrate(**kwargs)[source]Calibrate the model.

                  Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                  Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                  Parametersrows: int, default=1 Number of plots in length.

                  cols: int, default=2 Number of plots in width.

                  horizontal_spacing: float, default=0.05 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                  vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                  title: str, dict or None, default=None Title for the plot.

                  • If None, no title is shown.
                  • If str, text for the title.
                  • If dict, title configuration.

                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                  • If None: No legend is shown.
                  • If str: Location where to show the legend.
                  • If dict: Legend configuration.

                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                  display: bool, default=True Whether to render the plot.

                  Yieldsgo.Figure Plot object.

                  method clear()[source]Reset attributes and clear cache from the model.

                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                  • In-training validation scores
                  • Cached predictions.
                  • Shap values
                  • App instance
                  • Dashboard instance
                  • Calculated holdout data sets

                  method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                  Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                  Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                  method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                  ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                  By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                  Note

                  Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                  Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                  filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                  **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                  method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                  This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                  Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                  Returnspd.DataFrame Overview of the results.

                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                  Tip

                  Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                  Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                  • The task is binary or multilabel classification.
                  • The model has a predict_proba method.
                  • The metric evaluates predicted probabilities.

                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                  Returnspd.Series Scores of the model.

                  method export_pipeline()[source]Export the transformer pipeline with final estimator.

                  The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                  method fit(X=None, y=None)[source]Fit and validate the model.

                  The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                  ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                  y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                  method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                  In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                  Warning

                  Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                  Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                  method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                  Only available for models with a predict_proba method in a binary or multilabel classification task.

                  Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                  Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                  method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                  Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                  Parametersn_trials: int Number of trials for the hyperparameter tuning.

                  reset: bool, default=False Whether to start a new study or continue the existing one.

                  method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                  Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                  ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                  • If None: y is ignored.
                  • If int: Position of the target column in X.
                  • If str: Name of the target column in X.
                  • If dict: Name of the target column and sequence of values.
                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                  • If dataframe: Target columns for multioutput tasks.

                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                  Returnsdataframe Original feature set. Only returned if provided.

                  series or dataframe Original target column. Only returned if provided.

                  method predict(fh, X=None, verbose=None)[source]Get predictions on new data or existing rows.

                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                  Read more in the user guide.

                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                  Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                  method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]Get prediction intervals on new data or existing rows.

                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_interval method.

                  Read more in the user guide.

                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                  coverage: float or sequence, default=0.9 Nominal coverage(s) of predictive interval(s).

                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                  Returnsdataframe Predictions with shape=(n_samples, 2) or shape=(n_samples, 2 * n_targets) for multivariate tasks.

                  method predict_proba(fh, X=None, marginal=True, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                  Read more in the user guide.

                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                  marginal: bool, default=True Whether returned distribution is marginal by time index.

                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                  Returnssktime.proba.Normal Predicted distribution.

                  method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_quantiles method.

                  Read more in the user guide.

                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                  alpha: float or list of float, default=[0.05, 0.95] A probability or list of, at which quantile forecasts are computed.

                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                  Returnsdataframe Predictions with shape=(n_samples, len(alpha)) or shape=(n_samples, len(alpha) * n_targets) for multivariate tasks.

                  method predict_residuals(y, X=None, verbose=None)[source]Get residuals of forecasts on new data or existing rows.

                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_residuals method.

                  Read more in the user guide.

                  Parametersy: int, str, dict, sequence or dataframe Ground truth observations.

                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to y.

                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                  Returnsseries or dataframe Residuals with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                  method predict_var(fh, X=None, cov=False, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_var method.

                  Read more in the user guide.

                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                  cov: bool, default=False Whether to compute covariance matrix forecast or marginal variance forecasts.

                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                  Returnsdataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                  method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                  This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                  Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                  stage: str, default=\"None\" New desired stage for the model.

                  archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                  method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                  method score(y, X=None, fh=None, metric=None, verbose=None)[source]Get a metric score on new data.

                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                  Read more in the user guide.

                  Info

                  If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sktime's score method for estimators.

                  Parametersy: int, str, dict, sequence or dataframe Ground truth observations.

                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                  fh: int, sequence or ForecastingHorizon or None, default=None The forecasting horizon encoding the time stamps to forecast at.

                  metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                  Returnsfloat Metric score of y with respect to a ground truth.

                  method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                  The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                  Tip

                  Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                  Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                  host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                  port: int, default=8000 Port for HTTP server.

                  method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                  Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                  • If None: y is ignored.
                  • If int: Position of the target column in X.
                  • If str: Name of the target column in X.
                  • If dict: Name of the target column and sequence of values.
                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                  • If dataframe: Target columns for multioutput tasks.

                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                  Returnsdataframe Transformed feature set. Only returned if provided.

                  series or dataframe Transformed target column. Only returned if provided.

                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                  Recursively update the structure of the original layout with the values in the arguments.

                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                  Recursively update the structure of the original traces with the values in the arguments.

                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                  "}, {"location": "API/models/bag/", "title": "Bagging", "text": "

                  Bag accept sparse

                  Bagging uses an ensemble meta-estimator that fits base predictors on random subsets of the original dataset and then aggregates their individual predictions (either by voting or by averaging) to form a final prediction. Such a meta-estimator can typically be used as a way to reduce the variance of a black-box estimator by introducing randomization into its construction procedure and then making an ensemble out of it.

                  Corresponding estimators are:

                  • BaggingClassifier for classification tasks.
                  • BaggingRegressor for regression tasks.

                  Read more in sklearn's documentation.

                  See Also

                  DecisionTree Single Decision Tree.

                  LogisticRegression Logistic Regression.

                  RandomForest Random Forest.

                  "}, {"location": "API/models/bag/#example", "title": "Example", "text": "
                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"Bag\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: Bag\nMetric: f1\n\n\nResults for Bagging:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9982\nTest evaluation --> f1: 0.9444\nTime elapsed: 0.101s\n-------------------------------------------------\nTime: 0.101s\n\n\nFinal results ==================== >>\nTotal time: 0.104s\n-------------------------------------\nBagging --> f1: 0.9444\n
                  "}, {"location": "API/models/bag/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                  Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)max_samplesFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)max_featuresFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)bootstrapCategoricalDistribution(choices=(True, False))bootstrap_featuresCategoricalDistribution(choices=(True, False))

                  Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)max_samplesFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)max_featuresFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)bootstrapCategoricalDistribution(choices=(True, False))bootstrap_featuresCategoricalDistribution(choices=(True, False))

                  "}, {"location": "API/models/bag/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/bag/#data-attributes", "title": "Data attributes", "text": "

                  Attributespipeline: PipelinePipeline of transforms.

                  Models that used automated feature scaling have the scaler added.

                  Tip

                  Use the plot_pipeline method to visualize the pipeline.

                  mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                  The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                  "}, {"location": "API/models/bag/#utility-attributes", "title": "Utility attributes", "text": "

                  Attributesname: strName of the model.

                  Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                  This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                  This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                  This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                  • [param_name]: Parameter value used in this trial.
                  • estimator: Estimator used in this trial.
                  • [metric_name]: Metric score of the trial.
                  • [best_metric_name]: Best score so far in this study.
                  • time_trial: Duration of the trial.
                  • time_ht: Duration of the hyperparameter tuning.
                  • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                    For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                    This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                    All durations are in seconds. Possible values include:

                    • [metric]_ht: Score obtained by the hyperparameter tuning.
                    • time_ht: Duration of the hyperparameter tuning.
                    • [metric]_train: Metric score on the train set.
                    • [metric]_test: Metric score on the test set.
                    • time_fit: Duration of the model fitting on the train set.
                    • [metric]_bootstrap: Mean score on the bootstrapped samples.
                    • time_bootstrap: Duration of the bootstrapping.
                    • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                      The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                      "}, {"location": "API/models/bag/#methods", "title": "Methods", "text": "

                      The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                      bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                      method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                      Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                      Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                      reset: bool, default=False Whether to start a new run or continue the existing one.

                      method calibrate(**kwargs)[source]Calibrate the model.

                      Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                      Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                      method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                      This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                      Parametersrows: int, default=1 Number of plots in length.

                      cols: int, default=2 Number of plots in width.

                      horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                      vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                      title: str, dict or None, default=None Title for the plot.

                      • If None, no title is shown.
                      • If str, text for the title.
                      • If dict, title configuration.

                      legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                      • If None: No legend is shown.
                      • If str: Location where to show the legend.
                      • If dict: Legend configuration.

                      figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                      filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                      display: bool, default=True Whether to render the plot.

                      Yieldsgo.Figure Plot object.

                      method clear()[source]Reset attributes and clear cache from the model.

                      Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                      • In-training validation scores
                      • Cached predictions.
                      • Shap values
                      • App instance
                      • Dashboard instance
                      • Calculated holdout data sets

                      method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                      Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                      Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                      method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                      ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                      By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                      Note

                      Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                      Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                      filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                      **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                      method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                      This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                      Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                      Returnspd.DataFrame Overview of the results.

                      method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                      Read more in the user guide.

                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                      Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                      method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                      Tip

                      Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                      Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                      rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                      threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                      • The task is binary or multilabel classification.
                      • The model has a predict_proba method.
                      • The metric evaluates predicted probabilities.

                      For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                      sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                      Returnspd.Series Scores of the model.

                      method export_pipeline()[source]Export the transformer pipeline with final estimator.

                      The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                      ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                      method fit(X=None, y=None)[source]Fit and validate the model.

                      The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                      ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                      y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                      method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                      In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                      Warning

                      Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                      Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                      method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                      Only available for models with a predict_proba method in a binary or multilabel classification task.

                      Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                      Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                      method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                      Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                      Parametersn_trials: int Number of trials for the hyperparameter tuning.

                      reset: bool, default=False Whether to start a new study or continue the existing one.

                      method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                      Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                      ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                      • If None: y is ignored.
                      • If int: Position of the target column in X.
                      • If str: Name of the target column in X.
                      • If dict: Name of the target column and sequence of values.
                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                      • If dataframe: Target columns for multioutput tasks.

                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                      Returnsdataframe Original feature set. Only returned if provided.

                      series or dataframe Original target column. Only returned if provided.

                      method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                      Read more in the user guide.

                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                      Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                      method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                      Read more in the user guide.

                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                      Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                      method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                      Read more in the user guide.

                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                      Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                      method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                      This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                      Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                      stage: str, default=\"None\" New desired stage for the model.

                      archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                      method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                      method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                      Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                      method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                      Read more in the user guide.

                      Info

                      If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                      • If None: X must be a selection of rows in the dataset.
                      • If int: Position of the target column in X.
                      • If str: Name of the target column in X.
                      • If dict: Name of the target column and sequence of values.
                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                      • If dataframe: Target columns for multioutput tasks.

                      metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                      sample_weight: sequence or None, default=None Sample weights corresponding to y.

                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                      Returnsfloat Metric score of X with respect to y.

                      method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                      The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                      Tip

                      Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                      Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                      host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                      port: int, default=8000 Port for HTTP server.

                      method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                      Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                      ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                      • If None: y is ignored.
                      • If int: Position of the target column in X.
                      • If str: Name of the target column in X.
                      • If dict: Name of the target column and sequence of values.
                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                      • If dataframe: Target columns for multioutput tasks.

                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                      Returnsdataframe Transformed feature set. Only returned if provided.

                      series or dataframe Transformed target column. Only returned if provided.

                      method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                      Recursively update the structure of the original layout with the values in the arguments.

                      Parameters**kwargs Keyword arguments for the figure's update_layout method.

                      method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                      Recursively update the structure of the original traces with the values in the arguments.

                      Parameters**kwargs Keyword arguments for the figure's update_traces method.

                      "}, {"location": "API/models/bnb/", "title": "BernoulliNB", "text": "

                      BNB accept sparse supports acceleration

                      BernoulliNB implements the Naive Bayes algorithm for multivariate Bernoulli models. Like MultinomialNB, this classifier is suitable for discrete data. The difference is that while MNB works with occurrence counts, BNB is designed for binary/boolean features.

                      Corresponding estimators are:

                      • BernoulliNB for classification tasks.

                      Read more in sklearn's documentation.

                      See Also

                      ComplementNB Complement Naive Bayes.

                      CategoricalNB Categorical Naive Bayes.

                      MultinomialNB Multinomial Naive Bayes.

                      "}, {"location": "API/models/bnb/#example", "title": "Example", "text": "
                      >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"BNB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: BNB\nMetric: f1\n\n\nResults for BernoulliNB:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.7709\nTest evaluation --> f1: 0.7717\nTime elapsed: 0.023s\n-------------------------------------------------\nTime: 0.023s\n\n\nFinal results ==================== >>\nTotal time: 0.026s\n-------------------------------------\nBernoulliNB --> f1: 0.7717\n
                      "}, {"location": "API/models/bnb/#hyperparameters", "title": "Hyperparameters", "text": "sklearncuml

                      ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))

                      ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))

                      "}, {"location": "API/models/bnb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/bnb/#data-attributes", "title": "Data attributes", "text": "

                      Attributespipeline: PipelinePipeline of transforms.

                      Models that used automated feature scaling have the scaler added.

                      Tip

                      Use the plot_pipeline method to visualize the pipeline.

                      mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                      The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                      "}, {"location": "API/models/bnb/#utility-attributes", "title": "Utility attributes", "text": "

                      Attributesname: strName of the model.

                      Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                      This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                      This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                      This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                      • [param_name]: Parameter value used in this trial.
                      • estimator: Estimator used in this trial.
                      • [metric_name]: Metric score of the trial.
                      • [best_metric_name]: Best score so far in this study.
                      • time_trial: Duration of the trial.
                      • time_ht: Duration of the hyperparameter tuning.
                      • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                        For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                        This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                        The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                        All durations are in seconds. Possible values include:

                        • [metric]_ht: Score obtained by the hyperparameter tuning.
                        • time_ht: Duration of the hyperparameter tuning.
                        • [metric]_train: Metric score on the train set.
                        • [metric]_test: Metric score on the test set.
                        • time_fit: Duration of the model fitting on the train set.
                        • [metric]_bootstrap: Mean score on the bootstrapped samples.
                        • time_bootstrap: Duration of the bootstrapping.
                        • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                          The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                          "}, {"location": "API/models/bnb/#methods", "title": "Methods", "text": "

                          The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                          bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                          method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                          Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                          Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                          reset: bool, default=False Whether to start a new run or continue the existing one.

                          method calibrate(**kwargs)[source]Calibrate the model.

                          Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                          Parameters**kwargs Additional keyword arguments for sklearn's CalibratedClassifierCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                          This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                          Parametersrows: int, default=1 Number of plots in length.

                          cols: int, default=2 Number of plots in width.

                          horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                          vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                          title: str, dict or None, default=None Title for the plot.

                          • If None, no title is shown.
                          • If str, text for the title.
                          • If dict, title configuration.

                          legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                          • If None: No legend is shown.
                          • If str: Location where to show the legend.
                          • If dict: Legend configuration.

                          figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                          filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                          display: bool, default=True Whether to render the plot.

                          Yieldsgo.Figure Plot object.

                          method clear()[source]Reset attributes and clear cache from the model.

                          Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                          • In-training validation scores
                          • Cached predictions.
                          • Shap values
                          • App instance
                          • Dashboard instance
                          • Calculated holdout data sets

                          method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                          Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                          Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                          method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                          ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                          By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                          Note

                          Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                          Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                          filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                          **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                          method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                          This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                          Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                          Returnspd.DataFrame Overview of the results.

                          method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                          Read more in the user guide.

                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                          Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                          method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                          Tip

                          Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                          Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                          rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                          threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                          • The task is binary or multilabel classification.
                          • The model has a predict_proba method.
                          • The metric evaluates predicted probabilities.

                          For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                          sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                          Returnspd.Series Scores of the model.

                          method export_pipeline()[source]Export the transformer pipeline with final estimator.

                          The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                          ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                          method fit(X=None, y=None)[source]Fit and validate the model.

                          The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                          ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                          y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                          method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                          In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                          Warning

                          Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                          Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                          method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                          Only available for models with a predict_proba method in a binary or multilabel classification task.

                          Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                          Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                          method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                          Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                          Parametersn_trials: int Number of trials for the hyperparameter tuning.

                          reset: bool, default=False Whether to start a new study or continue the existing one.

                          method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                          Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                          ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                          • If None: y is ignored.
                          • If int: Position of the target column in X.
                          • If str: Name of the target column in X.
                          • If dict: Name of the target column and sequence of values.
                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                          • If dataframe: Target columns for multioutput tasks.

                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                          Returnsdataframe Original feature set. Only returned if provided.

                          series or dataframe Original target column. Only returned if provided.

                          method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                          Read more in the user guide.

                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                          Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                          method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                          Read more in the user guide.

                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                          Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                          method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                          Read more in the user guide.

                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                          Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                          method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                          This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                          Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                          stage: str, default=\"None\" New desired stage for the model.

                          archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                          method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                          method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                          Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                          method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                          Read more in the user guide.

                          Info

                          If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                          • If None: X must be a selection of rows in the dataset.
                          • If int: Position of the target column in X.
                          • If str: Name of the target column in X.
                          • If dict: Name of the target column and sequence of values.
                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                          • If dataframe: Target columns for multioutput tasks.

                          metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                          sample_weight: sequence or None, default=None Sample weights corresponding to y.

                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                          Returnsfloat Metric score of X with respect to y.

                          method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                          The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                          Tip

                          Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                          Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                          host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                          port: int, default=8000 Port for HTTP server.

                          method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                          Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                          ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                          • If None: y is ignored.
                          • If int: Position of the target column in X.
                          • If str: Name of the target column in X.
                          • If dict: Name of the target column and sequence of values.
                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                          • If dataframe: Target columns for multioutput tasks.

                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                          Returnsdataframe Transformed feature set. Only returned if provided.

                          series or dataframe Transformed target column. Only returned if provided.

                          method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                          Recursively update the structure of the original layout with the values in the arguments.

                          Parameters**kwargs Keyword arguments for the figure's update_layout method.

                          method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                          Recursively update the structure of the original traces with the values in the arguments.

                          Parameters**kwargs Keyword arguments for the figure's update_traces method.

                          "}, {"location": "API/models/br/", "title": "BayesianRidge", "text": "

                          BR needs scaling

                          Bayesian regression techniques can be used to include regularization parameters in the estimation procedure: the regularization parameter is not set in a hard sense but tuned to the data at hand.

                          Corresponding estimators are:

                          • BayesianRidge for regression tasks.

                          Read more in sklearn's documentation.

                          See Also

                          AutomaticRelevanceDetermination Automatic Relevance Determination.

                          GaussianProcess Gaussian process.

                          LeastAngleRegression Least Angle Regression.

                          "}, {"location": "API/models/br/#example", "title": "Example", "text": "
                          >>> from atom import ATOMRegressor\n>>> from sklearn.datasets import fetch_california_housing\n\n>>> X, y = fetch_california_housing(return_X_y=True)\n\n>>> atom = ATOMRegressor(X, y, random_state=1)\n>>> atom.run(models=\"BR\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= >>\nModels: BR\nMetric: r2\n\n\nResults for BayesianRidge:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.6067\nTest evaluation --> r2: 0.6028\nTime elapsed: 0.138s\n-------------------------------------------------\nTime: 0.138s\n\n\nFinal results ==================== >>\nTotal time: 0.139s\n-------------------------------------\nBayesianRidge --> r2: 0.6028\n
                          "}, {"location": "API/models/br/#hyperparameters", "title": "Hyperparameters", "text": "

                          Parametersn_iterIntDistribution(high=1000, log=False, low=100, step=10)alpha_1FloatDistribution(high=1.0, log=True, low=0.0001, step=None)alpha_2FloatDistribution(high=1.0, log=True, low=0.0001, step=None)lambda_1FloatDistribution(high=1.0, log=True, low=0.0001, step=None)lambda_2FloatDistribution(high=1.0, log=True, low=0.0001, step=None)

                          "}, {"location": "API/models/br/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/br/#data-attributes", "title": "Data attributes", "text": "

                          Attributespipeline: PipelinePipeline of transforms.

                          Models that used automated feature scaling have the scaler added.

                          Tip

                          Use the plot_pipeline method to visualize the pipeline.

                          mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                          The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                          "}, {"location": "API/models/br/#utility-attributes", "title": "Utility attributes", "text": "

                          Attributesname: strName of the model.

                          Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                          This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                          This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                          This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                          • [param_name]: Parameter value used in this trial.
                          • estimator: Estimator used in this trial.
                          • [metric_name]: Metric score of the trial.
                          • [best_metric_name]: Best score so far in this study.
                          • time_trial: Duration of the trial.
                          • time_ht: Duration of the hyperparameter tuning.
                          • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                            For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                            This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                            The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                            All durations are in seconds. Possible values include:

                            • [metric]_ht: Score obtained by the hyperparameter tuning.
                            • time_ht: Duration of the hyperparameter tuning.
                            • [metric]_train: Metric score on the train set.
                            • [metric]_test: Metric score on the test set.
                            • time_fit: Duration of the model fitting on the train set.
                            • [metric]_bootstrap: Mean score on the bootstrapped samples.
                            • time_bootstrap: Duration of the bootstrapping.
                            • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                              The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                              "}, {"location": "API/models/br/#methods", "title": "Methods", "text": "

                              The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                              bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                              method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                              Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                              Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                              reset: bool, default=False Whether to start a new run or continue the existing one.

                              method calibrate(**kwargs)[source]Calibrate the model.

                              Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                              Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                              method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                              This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                              Parametersrows: int, default=1 Number of plots in length.

                              cols: int, default=2 Number of plots in width.

                              horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                              vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                              title: str, dict or None, default=None Title for the plot.

                              • If None, no title is shown.
                              • If str, text for the title.
                              • If dict, title configuration.

                              legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                              • If None: No legend is shown.
                              • If str: Location where to show the legend.
                              • If dict: Legend configuration.

                              figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                              filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                              display: bool, default=True Whether to render the plot.

                              Yieldsgo.Figure Plot object.

                              method clear()[source]Reset attributes and clear cache from the model.

                              Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                              • In-training validation scores
                              • Cached predictions.
                              • Shap values
                              • App instance
                              • Dashboard instance
                              • Calculated holdout data sets

                              method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                              Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                              Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                              method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                              ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                              By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                              Note

                              Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                              Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                              filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                              **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                              method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                              This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                              Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                              Returnspd.DataFrame Overview of the results.

                              method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                              Read more in the user guide.

                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                              Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                              method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                              Tip

                              Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                              Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                              rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                              threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                              • The task is binary or multilabel classification.
                              • The model has a predict_proba method.
                              • The metric evaluates predicted probabilities.

                              For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                              sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                              Returnspd.Series Scores of the model.

                              method export_pipeline()[source]Export the transformer pipeline with final estimator.

                              The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                              ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                              method fit(X=None, y=None)[source]Fit and validate the model.

                              The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                              ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                              y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                              method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                              In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                              Warning

                              Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                              Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                              method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                              Only available for models with a predict_proba method in a binary or multilabel classification task.

                              Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                              Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                              method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                              Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                              Parametersn_trials: int Number of trials for the hyperparameter tuning.

                              reset: bool, default=False Whether to start a new study or continue the existing one.

                              method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                              Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                              ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                              • If None: y is ignored.
                              • If int: Position of the target column in X.
                              • If str: Name of the target column in X.
                              • If dict: Name of the target column and sequence of values.
                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                              • If dataframe: Target columns for multioutput tasks.

                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                              Returnsdataframe Original feature set. Only returned if provided.

                              series or dataframe Original target column. Only returned if provided.

                              method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                              Read more in the user guide.

                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                              Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                              method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                              Read more in the user guide.

                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                              Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                              method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                              Read more in the user guide.

                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                              Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                              method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                              This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                              Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                              stage: str, default=\"None\" New desired stage for the model.

                              archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                              method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                              method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                              Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                              method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                              Read more in the user guide.

                              Info

                              If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                              • If None: X must be a selection of rows in the dataset.
                              • If int: Position of the target column in X.
                              • If str: Name of the target column in X.
                              • If dict: Name of the target column and sequence of values.
                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                              • If dataframe: Target columns for multioutput tasks.

                              metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                              sample_weight: sequence or None, default=None Sample weights corresponding to y.

                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                              Returnsfloat Metric score of X with respect to y.

                              method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                              The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                              Tip

                              Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                              Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                              host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                              port: int, default=8000 Port for HTTP server.

                              method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                              Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                              ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                              • If None: y is ignored.
                              • If int: Position of the target column in X.
                              • If str: Name of the target column in X.
                              • If dict: Name of the target column and sequence of values.
                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                              • If dataframe: Target columns for multioutput tasks.

                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                              Returnsdataframe Transformed feature set. Only returned if provided.

                              series or dataframe Transformed target column. Only returned if provided.

                              method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                              Recursively update the structure of the original layout with the values in the arguments.

                              Parameters**kwargs Keyword arguments for the figure's update_layout method.

                              method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                              Recursively update the structure of the original traces with the values in the arguments.

                              Parameters**kwargs Keyword arguments for the figure's update_traces method.

                              "}, {"location": "API/models/catb/", "title": "CatBoost", "text": "

                              CatB needs scaling accept sparse allows validation supports acceleration

                              CatBoost is a machine learning method based on gradient boosting over decision trees. Main advantages of CatBoost:

                              • Superior quality when compared with other GBDT models on many datasets.
                              • Best in class prediction speed.

                              Corresponding estimators are:

                              • CatBoostClassifier for classification tasks.
                              • CatBoostRegressor for regression tasks.

                              Read more in CatBoost's documentation.

                              Warning

                              • CatBoost selects the weights achieved by the best evaluation on the test set after training. This means that, by default, there is some minor data leakage in the test set. Use the use_best_model=False parameter to avoid this behavior or use a holdout set to evaluate the final estimator.
                              • In-training validation and pruning are disabled when device=\"gpu\".

                              Note

                              ATOM uses CatBoost's n_estimators parameter instead of iterations to indicate the number of trees to fit. This is done to have consistent naming with the XGBoost and LightGBM models.

                              See Also

                              GradientBoostingMachine Gradient Boosting Machine.

                              LightGBM Light Gradient Boosting Machine.

                              XGBoost Extreme Gradient Boosting.

                              "}, {"location": "API/models/catb/#example", "title": "Example", "text": "
                              >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"CatB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: CatB\nMetric: f1\n\n\nResults for CatBoost:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9655\nTime elapsed: 14.218s\n-------------------------------------------------\nTime: 14.218s\n\n\nFinal results ==================== >>\nTotal time: 14.221s\n-------------------------------------\nCatBoost --> f1: 0.9655\n
                              "}, {"location": "API/models/catb/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                              Parametersn_estimatorsIntDistribution(high=500, log=False, low=20, step=10)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_child_samplesIntDistribution(high=30, log=False, low=1, step=1)bootstrap_typeCategoricalDistribution(choices=('Bayesian', 'Bernoulli'))bagging_temperatureFloatDistribution(high=10.0, log=False, low=0.0, step=None)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)reg_lambdaFloatDistribution(high=100.0, log=True, low=0.001, step=None)

                              Parametersn_estimatorsIntDistribution(high=500, log=False, low=20, step=10)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_child_samplesIntDistribution(high=30, log=False, low=1, step=1)bootstrap_typeCategoricalDistribution(choices=('Bayesian', 'Bernoulli'))bagging_temperatureFloatDistribution(high=10.0, log=False, low=0.0, step=None)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)reg_lambdaFloatDistribution(high=100.0, log=True, low=0.001, step=None)

                              "}, {"location": "API/models/catb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/catb/#data-attributes", "title": "Data attributes", "text": "

                              Attributespipeline: PipelinePipeline of transforms.

                              Models that used automated feature scaling have the scaler added.

                              Tip

                              Use the plot_pipeline method to visualize the pipeline.

                              mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                              The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                              "}, {"location": "API/models/catb/#utility-attributes", "title": "Utility attributes", "text": "

                              Attributesname: strName of the model.

                              Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                              This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                              This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                              This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                              • [param_name]: Parameter value used in this trial.
                              • estimator: Estimator used in this trial.
                              • [metric_name]: Metric score of the trial.
                              • [best_metric_name]: Best score so far in this study.
                              • time_trial: Duration of the trial.
                              • time_ht: Duration of the hyperparameter tuning.
                              • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training.

                                Only the scores of the main metric are tracked. Included keys are: train and test. This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                All durations are in seconds. Possible values include:

                                • [metric]_ht: Score obtained by the hyperparameter tuning.
                                • time_ht: Duration of the hyperparameter tuning.
                                • [metric]_train: Metric score on the train set.
                                • [metric]_test: Metric score on the test set.
                                • time_fit: Duration of the model fitting on the train set.
                                • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                • time_bootstrap: Duration of the bootstrapping.
                                • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                  The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                  "}, {"location": "API/models/catb/#methods", "title": "Methods", "text": "

                                  The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                  bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                  method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                  Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                  Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                  reset: bool, default=False Whether to start a new run or continue the existing one.

                                  method calibrate(**kwargs)[source]Calibrate the model.

                                  Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                  Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                  Parametersrows: int, default=1 Number of plots in length.

                                  cols: int, default=2 Number of plots in width.

                                  horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                  vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                  title: str, dict or None, default=None Title for the plot.

                                  • If None, no title is shown.
                                  • If str, text for the title.
                                  • If dict, title configuration.

                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                  • If None: No legend is shown.
                                  • If str: Location where to show the legend.
                                  • If dict: Legend configuration.

                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                  display: bool, default=True Whether to render the plot.

                                  Yieldsgo.Figure Plot object.

                                  method clear()[source]Reset attributes and clear cache from the model.

                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                  • In-training validation scores
                                  • Cached predictions.
                                  • Shap values
                                  • App instance
                                  • Dashboard instance
                                  • Calculated holdout data sets

                                  method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                  Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                  Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                  method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                  ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                  By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                  Note

                                  Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                  filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                  **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                  method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                  This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                  Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                  Returnspd.DataFrame Overview of the results.

                                  method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                  Read more in the user guide.

                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                  Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                  Tip

                                  Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                  • The task is binary or multilabel classification.
                                  • The model has a predict_proba method.
                                  • The metric evaluates predicted probabilities.

                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                  Returnspd.Series Scores of the model.

                                  method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                  The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                  method fit(X=None, y=None)[source]Fit and validate the model.

                                  The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                  ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                  y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                  method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                  In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                  Warning

                                  Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                  Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                  method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                  Only available for models with a predict_proba method in a binary or multilabel classification task.

                                  Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                  Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                  method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                  Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                  Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                  reset: bool, default=False Whether to start a new study or continue the existing one.

                                  method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                  Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                  ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                  • If None: y is ignored.
                                  • If int: Position of the target column in X.
                                  • If str: Name of the target column in X.
                                  • If dict: Name of the target column and sequence of values.
                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                  • If dataframe: Target columns for multioutput tasks.

                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                  Returnsdataframe Original feature set. Only returned if provided.

                                  series or dataframe Original target column. Only returned if provided.

                                  method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                  Read more in the user guide.

                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                  Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                  method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                  Read more in the user guide.

                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                  Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                  method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                  Read more in the user guide.

                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                  Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                  method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                  This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                  Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                  stage: str, default=\"None\" New desired stage for the model.

                                  archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                  method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                  method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                  Read more in the user guide.

                                  Info

                                  If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                  • If None: X must be a selection of rows in the dataset.
                                  • If int: Position of the target column in X.
                                  • If str: Name of the target column in X.
                                  • If dict: Name of the target column and sequence of values.
                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                  • If dataframe: Target columns for multioutput tasks.

                                  metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                  sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                  Returnsfloat Metric score of X with respect to y.

                                  method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                  The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                  Tip

                                  Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                  Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                  host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                  port: int, default=8000 Port for HTTP server.

                                  method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                  Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                  • If None: y is ignored.
                                  • If int: Position of the target column in X.
                                  • If str: Name of the target column in X.
                                  • If dict: Name of the target column and sequence of values.
                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                  • If dataframe: Target columns for multioutput tasks.

                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                  series or dataframe Transformed target column. Only returned if provided.

                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                  Recursively update the structure of the original layout with the values in the arguments.

                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                  Recursively update the structure of the original traces with the values in the arguments.

                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                  "}, {"location": "API/models/catnb/", "title": "CategoricalNB", "text": "

                                  CatNB accept sparse supports acceleration

                                  Categorical Naive Bayes implements the Naive Bayes algorithm for categorical features.

                                  Corresponding estimators are:

                                  • CategoricalNB for classification tasks.

                                  Read more in sklearn's documentation.

                                  See Also

                                  BernoulliNB Bernoulli Naive Bayes.

                                  ComplementNB Complement Naive Bayes.

                                  GaussianNB Gaussian Naive Bayes.

                                  "}, {"location": "API/models/catnb/#example", "title": "Example", "text": "
                                  >>> from atom import ATOMClassifier\n>>> import numpy as np\n\n>>> X = np.random.randint(5, size=(100, 100))\n>>> y = np.random.randint(2, size=100)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"CatNB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: CatNB\nMetric: f1\n\n\nResults for CategoricalNB:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.4444\nTime elapsed: 0.029s\n-------------------------------------------------\nTime: 0.029s\n\n\nFinal results ==================== >>\nTotal time: 0.032s\n-------------------------------------\nCategoricalNB --> f1: 0.4444 ~\n
                                  "}, {"location": "API/models/catnb/#hyperparameters", "title": "Hyperparameters", "text": "sklearncuml

                                  ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))

                                  ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))

                                  "}, {"location": "API/models/catnb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/catnb/#data-attributes", "title": "Data attributes", "text": "

                                  Attributespipeline: PipelinePipeline of transforms.

                                  Models that used automated feature scaling have the scaler added.

                                  Tip

                                  Use the plot_pipeline method to visualize the pipeline.

                                  mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                  The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                  "}, {"location": "API/models/catnb/#utility-attributes", "title": "Utility attributes", "text": "

                                  Attributesname: strName of the model.

                                  Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                  This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                  This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                  This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                  • [param_name]: Parameter value used in this trial.
                                  • estimator: Estimator used in this trial.
                                  • [metric_name]: Metric score of the trial.
                                  • [best_metric_name]: Best score so far in this study.
                                  • time_trial: Duration of the trial.
                                  • time_ht: Duration of the hyperparameter tuning.
                                  • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                    For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                    This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                    All durations are in seconds. Possible values include:

                                    • [metric]_ht: Score obtained by the hyperparameter tuning.
                                    • time_ht: Duration of the hyperparameter tuning.
                                    • [metric]_train: Metric score on the train set.
                                    • [metric]_test: Metric score on the test set.
                                    • time_fit: Duration of the model fitting on the train set.
                                    • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                    • time_bootstrap: Duration of the bootstrapping.
                                    • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                      The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                      "}, {"location": "API/models/catnb/#methods", "title": "Methods", "text": "

                                      The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                      bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                      method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                      Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                      Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                      reset: bool, default=False Whether to start a new run or continue the existing one.

                                      method calibrate(**kwargs)[source]Calibrate the model.

                                      Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                      Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                      method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                      This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                      Parametersrows: int, default=1 Number of plots in length.

                                      cols: int, default=2 Number of plots in width.

                                      horizontal_spacing: float, default=0.05 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                      vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                      title: str, dict or None, default=None Title for the plot.

                                      • If None, no title is shown.
                                      • If str, text for the title.
                                      • If dict, title configuration.

                                      legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                      • If None: No legend is shown.
                                      • If str: Location where to show the legend.
                                      • If dict: Legend configuration.

                                      figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                      filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                      display: bool, default=True Whether to render the plot.

                                      Yieldsgo.Figure Plot object.

                                      method clear()[source]Reset attributes and clear cache from the model.

                                      Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                      • In-training validation scores
                                      • Cached predictions.
                                      • Shap values
                                      • App instance
                                      • Dashboard instance
                                      • Calculated holdout data sets

                                      method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                      Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                      Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                      method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                      ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                      By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                      Note

                                      Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                      Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                      filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                      **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                      method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                      This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                      Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                      Returnspd.DataFrame Overview of the results.

                                      method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                      Read more in the user guide.

                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                      Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                      method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                      Tip

                                      Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                      Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                      rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                      threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                      • The task is binary or multilabel classification.
                                      • The model has a predict_proba method.
                                      • The metric evaluates predicted probabilities.

                                      For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                      sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                      Returnspd.Series Scores of the model.

                                      method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                      The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                      ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                      method fit(X=None, y=None)[source]Fit and validate the model.

                                      The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                      ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                      y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                      method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                      In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                      Warning

                                      Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                      Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                      method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                      Only available for models with a predict_proba method in a binary or multilabel classification task.

                                      Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                      Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                      method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                      Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                      Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                      reset: bool, default=False Whether to start a new study or continue the existing one.

                                      method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                      Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                      ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                      • If None: y is ignored.
                                      • If int: Position of the target column in X.
                                      • If str: Name of the target column in X.
                                      • If dict: Name of the target column and sequence of values.
                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                      • If dataframe: Target columns for multioutput tasks.

                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                      Returnsdataframe Original feature set. Only returned if provided.

                                      series or dataframe Original target column. Only returned if provided.

                                      method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                      Read more in the user guide.

                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                      Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                      method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                      Read more in the user guide.

                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                      Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                      method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                      Read more in the user guide.

                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                      Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                      method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                      This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                      Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                      stage: str, default=\"None\" New desired stage for the model.

                                      archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                      method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                      method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                      Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                      method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                      Read more in the user guide.

                                      Info

                                      If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                      • If None: X must be a selection of rows in the dataset.
                                      • If int: Position of the target column in X.
                                      • If str: Name of the target column in X.
                                      • If dict: Name of the target column and sequence of values.
                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                      • If dataframe: Target columns for multioutput tasks.

                                      metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                      sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                      Returnsfloat Metric score of X with respect to y.

                                      method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                      The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                      Tip

                                      Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                      Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                      host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                      port: int, default=8000 Port for HTTP server.

                                      method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                      Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                      ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                      • If None: y is ignored.
                                      • If int: Position of the target column in X.
                                      • If str: Name of the target column in X.
                                      • If dict: Name of the target column and sequence of values.
                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                      • If dataframe: Target columns for multioutput tasks.

                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                      Returnsdataframe Transformed feature set. Only returned if provided.

                                      series or dataframe Transformed target column. Only returned if provided.

                                      method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                      Recursively update the structure of the original layout with the values in the arguments.

                                      Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                      method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                      Recursively update the structure of the original traces with the values in the arguments.

                                      Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                      "}, {"location": "API/models/cnb/", "title": "ComplementNB", "text": "

                                      CNB accept sparse supports acceleration

                                      The Complement Naive Bayes classifier was designed to correct the \"severe assumptions\" made by the standard MultinomialNB classifier. It is particularly suited for imbalanced datasets.

                                      Corresponding estimators are:

                                      • ComplementNB for classification tasks.

                                      Read more in sklearn's documentation.

                                      See Also

                                      BernoulliNB Bernoulli Naive Bayes.

                                      CategoricalNB Categorical Naive Bayes.

                                      MultinomialNB Multinomial Naive Bayes.

                                      "}, {"location": "API/models/cnb/#example", "title": "Example", "text": "
                                      >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"CNB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: CNB\nMetric: f1\n\n\nResults for ComplementNB:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9221\nTest evaluation --> f1: 0.9128\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nFinal results ==================== >>\nTotal time: 0.023s\n-------------------------------------\nComplementNB --> f1: 0.9128\n
                                      "}, {"location": "API/models/cnb/#hyperparameters", "title": "Hyperparameters", "text": "sklearncuml

                                      ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))normCategoricalDistribution(choices=(True, False))

                                      ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))normCategoricalDistribution(choices=(True, False))

                                      "}, {"location": "API/models/cnb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/cnb/#data-attributes", "title": "Data attributes", "text": "

                                      Attributespipeline: PipelinePipeline of transforms.

                                      Models that used automated feature scaling have the scaler added.

                                      Tip

                                      Use the plot_pipeline method to visualize the pipeline.

                                      mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                      The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                      "}, {"location": "API/models/cnb/#utility-attributes", "title": "Utility attributes", "text": "

                                      Attributesname: strName of the model.

                                      Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                      This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                      This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                      This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                      • [param_name]: Parameter value used in this trial.
                                      • estimator: Estimator used in this trial.
                                      • [metric_name]: Metric score of the trial.
                                      • [best_metric_name]: Best score so far in this study.
                                      • time_trial: Duration of the trial.
                                      • time_ht: Duration of the hyperparameter tuning.
                                      • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                        For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                        This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                        The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                        All durations are in seconds. Possible values include:

                                        • [metric]_ht: Score obtained by the hyperparameter tuning.
                                        • time_ht: Duration of the hyperparameter tuning.
                                        • [metric]_train: Metric score on the train set.
                                        • [metric]_test: Metric score on the test set.
                                        • time_fit: Duration of the model fitting on the train set.
                                        • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                        • time_bootstrap: Duration of the bootstrapping.
                                        • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                          The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                          "}, {"location": "API/models/cnb/#methods", "title": "Methods", "text": "

                                          The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                          bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                          method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                          Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                          Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                          reset: bool, default=False Whether to start a new run or continue the existing one.

                                          method calibrate(**kwargs)[source]Calibrate the model.

                                          Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                          Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                          This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                          Parametersrows: int, default=1 Number of plots in length.

                                          cols: int, default=2 Number of plots in width.

                                          horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                          vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                          title: str, dict or None, default=None Title for the plot.

                                          • If None, no title is shown.
                                          • If str, text for the title.
                                          • If dict, title configuration.

                                          legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                          • If None: No legend is shown.
                                          • If str: Location where to show the legend.
                                          • If dict: Legend configuration.

                                          figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                          filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                          display: bool, default=True Whether to render the plot.

                                          Yieldsgo.Figure Plot object.

                                          method clear()[source]Reset attributes and clear cache from the model.

                                          Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                          • In-training validation scores
                                          • Cached predictions.
                                          • Shap values
                                          • App instance
                                          • Dashboard instance
                                          • Calculated holdout data sets

                                          method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                          Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                          Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                          method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                          ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                          By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                          Note

                                          Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                          Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                          filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                          **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                          method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                          This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                          Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                          Returnspd.DataFrame Overview of the results.

                                          method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                          Read more in the user guide.

                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                          Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                          method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                          Tip

                                          Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                          Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                          rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                          threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                          • The task is binary or multilabel classification.
                                          • The model has a predict_proba method.
                                          • The metric evaluates predicted probabilities.

                                          For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                          sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                          Returnspd.Series Scores of the model.

                                          method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                          The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                          ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                          method fit(X=None, y=None)[source]Fit and validate the model.

                                          The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                          ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                          y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                          method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                          In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                          Warning

                                          Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                          Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                          method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                          Only available for models with a predict_proba method in a binary or multilabel classification task.

                                          Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                          Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                          method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                          Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                          Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                          reset: bool, default=False Whether to start a new study or continue the existing one.

                                          method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                          Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                          ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                          • If None: y is ignored.
                                          • If int: Position of the target column in X.
                                          • If str: Name of the target column in X.
                                          • If dict: Name of the target column and sequence of values.
                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                          • If dataframe: Target columns for multioutput tasks.

                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                          Returnsdataframe Original feature set. Only returned if provided.

                                          series or dataframe Original target column. Only returned if provided.

                                          method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                          Read more in the user guide.

                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                          Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                          method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                          Read more in the user guide.

                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                          Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                          method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                          Read more in the user guide.

                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                          Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                          method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                          This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                          Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                          stage: str, default=\"None\" New desired stage for the model.

                                          archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                          method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                          method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                          Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                          method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                          Read more in the user guide.

                                          Info

                                          If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                          • If None: X must be a selection of rows in the dataset.
                                          • If int: Position of the target column in X.
                                          • If str: Name of the target column in X.
                                          • If dict: Name of the target column and sequence of values.
                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                          • If dataframe: Target columns for multioutput tasks.

                                          metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                          sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                          Returnsfloat Metric score of X with respect to y.

                                          method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                          The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                          Tip

                                          Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                          Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                          host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                          port: int, default=8000 Port for HTTP server.

                                          method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                          Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                          ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                          • If None: y is ignored.
                                          • If int: Position of the target column in X.
                                          • If str: Name of the target column in X.
                                          • If dict: Name of the target column and sequence of values.
                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                          • If dataframe: Target columns for multioutput tasks.

                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                          Returnsdataframe Transformed feature set. Only returned if provided.

                                          series or dataframe Transformed target column. Only returned if provided.

                                          method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                          Recursively update the structure of the original layout with the values in the arguments.

                                          Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                          method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                          Recursively update the structure of the original traces with the values in the arguments.

                                          Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                          "}, {"location": "API/models/dummy/", "title": "Dummy", "text": "

                                          Dummy

                                          When doing supervised learning, a simple sanity check consists of comparing one's estimator against simple rules of thumb. The prediction methods completely ignore the input data. Do not use this model for real problems. Use it only as a simple baseline to compare with other models.

                                          Corresponding estimators are:

                                          • DummyClassifier for classification tasks.
                                          • DummyRegressor for regression tasks.

                                          Read more in sklearn's documentation.

                                          See Also

                                          DecisionTree Single Decision Tree.

                                          ExtraTree Extremely Randomized Tree.

                                          NaiveForecaster Naive Forecaster.

                                          "}, {"location": "API/models/dummy/#example", "title": "Example", "text": "
                                          >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"Dummy\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: Dummy\nMetric: f1\n\n\nResults for Dummy:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.7709\nTest evaluation --> f1: 0.7717\nTime elapsed: 0.018s\n-------------------------------------------------\nTime: 0.018s\n\n\nFinal results ==================== >>\nTotal time: 0.021s\n-------------------------------------\nDummy --> f1: 0.7717\n
                                          "}, {"location": "API/models/dummy/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                                          ParametersstrategyCategoricalDistribution(choices=('most_frequent', 'prior', 'stratified', 'uniform'))

                                          ParametersstrategyCategoricalDistribution(choices=('mean', 'median', 'quantile'))quantileFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)

                                          "}, {"location": "API/models/dummy/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/dummy/#data-attributes", "title": "Data attributes", "text": "

                                          Attributespipeline: PipelinePipeline of transforms.

                                          Models that used automated feature scaling have the scaler added.

                                          Tip

                                          Use the plot_pipeline method to visualize the pipeline.

                                          mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                          The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                          "}, {"location": "API/models/dummy/#utility-attributes", "title": "Utility attributes", "text": "

                                          Attributesname: strName of the model.

                                          Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                          This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                          This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                          This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                          • [param_name]: Parameter value used in this trial.
                                          • estimator: Estimator used in this trial.
                                          • [metric_name]: Metric score of the trial.
                                          • [best_metric_name]: Best score so far in this study.
                                          • time_trial: Duration of the trial.
                                          • time_ht: Duration of the hyperparameter tuning.
                                          • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                            For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                            This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                            The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                            All durations are in seconds. Possible values include:

                                            • [metric]_ht: Score obtained by the hyperparameter tuning.
                                            • time_ht: Duration of the hyperparameter tuning.
                                            • [metric]_train: Metric score on the train set.
                                            • [metric]_test: Metric score on the test set.
                                            • time_fit: Duration of the model fitting on the train set.
                                            • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                            • time_bootstrap: Duration of the bootstrapping.
                                            • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                              The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                              "}, {"location": "API/models/dummy/#methods", "title": "Methods", "text": "

                                              The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                              bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                              method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                              Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                              Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                              reset: bool, default=False Whether to start a new run or continue the existing one.

                                              method calibrate(**kwargs)[source]Calibrate the model.

                                              Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                              Parameters**kwargs Additional keyword arguments for sklearn's CalibratedClassifierCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                              method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                              This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                              Parametersrows: int, default=1 Number of plots in length.

                                              cols: int, default=2 Number of plots in width.

                                              horizontal_spacing: float, default=0.05 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                              vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                              title: str, dict or None, default=None Title for the plot.

                                              • If None, no title is shown.
                                              • If str, text for the title.
                                              • If dict, title configuration.

                                              legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                              • If None: No legend is shown.
                                              • If str: Location where to show the legend.
                                              • If dict: Legend configuration.

                                              figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                              filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                              display: bool, default=True Whether to render the plot.

                                              Yieldsgo.Figure Plot object.

                                              method clear()[source]Reset attributes and clear cache from the model.

                                              Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                              • In-training validation scores
                                              • Cached predictions.
                                              • Shap values
                                              • App instance
                                              • Dashboard instance
                                              • Calculated holdout data sets

                                              method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                              Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                              Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                              method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                              ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                              By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                              Note

                                              Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                              Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                              filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                              **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                              method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                              This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                              Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                              Returnspd.DataFrame Overview of the results.

                                              method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                              Read more in the user guide.

                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                              Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                              method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                              Tip

                                              Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                              Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                              rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                              threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                              • The task is binary or multilabel classification.
                                              • The model has a predict_proba method.
                                              • The metric evaluates predicted probabilities.

                                              For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                              sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                              Returnspd.Series Scores of the model.

                                              method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                              The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                              ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                              method fit(X=None, y=None)[source]Fit and validate the model.

                                              The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                              ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                              y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                              method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                              In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                              Warning

                                              Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                              Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                              method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                              Only available for models with a predict_proba method in a binary or multilabel classification task.

                                              Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                              Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                              method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                              Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                              Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                              reset: bool, default=False Whether to start a new study or continue the existing one.

                                              method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                              Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                              ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                              • If None: y is ignored.
                                              • If int: Position of the target column in X.
                                              • If str: Name of the target column in X.
                                              • If dict: Name of the target column and sequence of values.
                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                              • If dataframe: Target columns for multioutput tasks.

                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                              Returnsdataframe Original feature set. Only returned if provided.

                                              series or dataframe Original target column. Only returned if provided.

                                              method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                              Read more in the user guide.

                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                              Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                              method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                              Read more in the user guide.

                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                              Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                              method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                              Read more in the user guide.

                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                              Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                              method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                              This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                              Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                              stage: str, default=\"None\" New desired stage for the model.

                                              archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                              method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                              method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                              Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                              method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                              Read more in the user guide.

                                              Info

                                              If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                              • If None: X must be a selection of rows in the dataset.
                                              • If int: Position of the target column in X.
                                              • If str: Name of the target column in X.
                                              • If dict: Name of the target column and sequence of values.
                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                              • If dataframe: Target columns for multioutput tasks.

                                              metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                              sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                              Returnsfloat Metric score of X with respect to y.

                                              method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                              The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                              Tip

                                              Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                              Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                              host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                              port: int, default=8000 Port for HTTP server.

                                              method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                              Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                              ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                              • If None: y is ignored.
                                              • If int: Position of the target column in X.
                                              • If str: Name of the target column in X.
                                              • If dict: Name of the target column and sequence of values.
                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                              • If dataframe: Target columns for multioutput tasks.

                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                              Returnsdataframe Transformed feature set. Only returned if provided.

                                              series or dataframe Transformed target column. Only returned if provided.

                                              method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                              Recursively update the structure of the original layout with the values in the arguments.

                                              Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                              method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                              Recursively update the structure of the original traces with the values in the arguments.

                                              Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                              "}, {"location": "API/models/en/", "title": "ElasticNet", "text": "

                                              EN needs scaling accept sparse supports acceleration

                                              Linear least squares with l1 and l2 regularization.

                                              Corresponding estimators are:

                                              • ElasticNet for regression tasks.

                                              Read more in sklearn's documentation.

                                              See Also

                                              Lasso Linear Regression with lasso regularization.

                                              OrdinaryLeastSquares Linear Regression.

                                              Ridge Linear least squares with l2 regularization.

                                              "}, {"location": "API/models/en/#example", "title": "Example", "text": "
                                              >>> from atom import ATOMRegressor\n>>> from sklearn.datasets import fetch_california_housing\n\n>>> X, y = fetch_california_housing(return_X_y=True)\n\n>>> atom = ATOMRegressor(X, y, random_state=1)\n>>> atom.run(models=\"EN\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= >>\nModels: EN\nMetric: r2\n\n\nResults for ElasticNet:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.2061\nTest evaluation --> r2: 0.2016\nTime elapsed: 0.137s\n-------------------------------------------------\nTime: 0.137s\n\n\nFinal results ==================== >>\nTotal time: 0.139s\n-------------------------------------\nElasticNet --> r2: 0.2016\n
                                              "}, {"location": "API/models/en/#hyperparameters", "title": "Hyperparameters", "text": "sklearnsklearnexcuml

                                              ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)selectionCategoricalDistribution(choices=('cyclic', 'random'))

                                              cpugpu

                                              ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)selectionCategoricalDistribution(choices=('cyclic', 'random'))

                                              ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)selectionCategoricalDistribution(choices=('cyclic', 'random'))

                                              ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)selectionCategoricalDistribution(choices=('cyclic', 'random'))

                                              "}, {"location": "API/models/en/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/en/#data-attributes", "title": "Data attributes", "text": "

                                              Attributespipeline: PipelinePipeline of transforms.

                                              Models that used automated feature scaling have the scaler added.

                                              Tip

                                              Use the plot_pipeline method to visualize the pipeline.

                                              mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                              The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                              "}, {"location": "API/models/en/#utility-attributes", "title": "Utility attributes", "text": "

                                              Attributesname: strName of the model.

                                              Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                              This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                              This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                              This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                              • [param_name]: Parameter value used in this trial.
                                              • estimator: Estimator used in this trial.
                                              • [metric_name]: Metric score of the trial.
                                              • [best_metric_name]: Best score so far in this study.
                                              • time_trial: Duration of the trial.
                                              • time_ht: Duration of the hyperparameter tuning.
                                              • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                All durations are in seconds. Possible values include:

                                                • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                • time_ht: Duration of the hyperparameter tuning.
                                                • [metric]_train: Metric score on the train set.
                                                • [metric]_test: Metric score on the test set.
                                                • time_fit: Duration of the model fitting on the train set.
                                                • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                • time_bootstrap: Duration of the bootstrapping.
                                                • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                  The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                  "}, {"location": "API/models/en/#methods", "title": "Methods", "text": "

                                                  The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                  bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                  method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                  Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                  Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                  reset: bool, default=False Whether to start a new run or continue the existing one.

                                                  method calibrate(**kwargs)[source]Calibrate the model.

                                                  Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                  Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                  Parametersrows: int, default=1 Number of plots in length.

                                                  cols: int, default=2 Number of plots in width.

                                                  horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                  vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                  title: str, dict or None, default=None Title for the plot.

                                                  • If None, no title is shown.
                                                  • If str, text for the title.
                                                  • If dict, title configuration.

                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                  • If None: No legend is shown.
                                                  • If str: Location where to show the legend.
                                                  • If dict: Legend configuration.

                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                  display: bool, default=True Whether to render the plot.

                                                  Yieldsgo.Figure Plot object.

                                                  method clear()[source]Reset attributes and clear cache from the model.

                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                  • In-training validation scores
                                                  • Cached predictions.
                                                  • Shap values
                                                  • App instance
                                                  • Dashboard instance
                                                  • Calculated holdout data sets

                                                  method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                  Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                  Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                  method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                  ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                  By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                  Note

                                                  Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                  filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                  **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                  method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                  This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                  Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                  Returnspd.DataFrame Overview of the results.

                                                  method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                  Read more in the user guide.

                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                  Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                  Tip

                                                  Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                  • The task is binary or multilabel classification.
                                                  • The model has a predict_proba method.
                                                  • The metric evaluates predicted probabilities.

                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                  Returnspd.Series Scores of the model.

                                                  method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                  The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                  method fit(X=None, y=None)[source]Fit and validate the model.

                                                  The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                  ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                  y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                  method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                  In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                  Warning

                                                  Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                  Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                  method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                  Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                  Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                  Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                  method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                  Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                  Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                  reset: bool, default=False Whether to start a new study or continue the existing one.

                                                  method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                  Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                  ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                  • If None: y is ignored.
                                                  • If int: Position of the target column in X.
                                                  • If str: Name of the target column in X.
                                                  • If dict: Name of the target column and sequence of values.
                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                  • If dataframe: Target columns for multioutput tasks.

                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                  Returnsdataframe Original feature set. Only returned if provided.

                                                  series or dataframe Original target column. Only returned if provided.

                                                  method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                  Read more in the user guide.

                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                  Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                  method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                  Read more in the user guide.

                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                  Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                  method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                  Read more in the user guide.

                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                  Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                  method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                  This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                  Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                  stage: str, default=\"None\" New desired stage for the model.

                                                  archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                  method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                  method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                  Read more in the user guide.

                                                  Info

                                                  If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                  • If None: X must be a selection of rows in the dataset.
                                                  • If int: Position of the target column in X.
                                                  • If str: Name of the target column in X.
                                                  • If dict: Name of the target column and sequence of values.
                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                  • If dataframe: Target columns for multioutput tasks.

                                                  metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                  Returnsfloat Metric score of X with respect to y.

                                                  method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                                  The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                  Tip

                                                  Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                  Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                  host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                  port: int, default=8000 Port for HTTP server.

                                                  method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                  Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                  • If None: y is ignored.
                                                  • If int: Position of the target column in X.
                                                  • If str: Name of the target column in X.
                                                  • If dict: Name of the target column and sequence of values.
                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                  • If dataframe: Target columns for multioutput tasks.

                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                  series or dataframe Transformed target column. Only returned if provided.

                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                  "}, {"location": "API/models/es/", "title": "ExponentialSmoothing", "text": "

                                                  ES native multioutput

                                                  Holt-Winters exponential smoothing forecaster. The default settings use simple exponential smoothing, without trend and seasonality components.

                                                  Corresponding estimators are:

                                                  • ExponentialSmoothing for forecasting tasks.

                                                  See Also

                                                  ARIMA Autoregressive Integrated Moving Average Model.

                                                  ETS ETS model with automatic fitting capabilities.

                                                  PolynomialTrend Polynomial Trend forecaster.

                                                  "}, {"location": "API/models/es/#example", "title": "Example", "text": "
                                                  >>> from atom import ATOMForecaster\n>>> from sktime.datasets import load_airline\n\n>>> y = load_airline()\n\n>>> atom = ATOMForecaster(y, random_state=1)\n>>> atom.run(models=\"ES\", verbose=2)\n\n\nTraining ========================= >>\nModels: ES\nMetric: mape\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0864\nTest evaluation --> mape: -0.2303\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nFinal results ==================== >>\nTotal time: 0.020s\n-------------------------------------\nExponentialSmoothing --> mape: -0.2303\n
                                                  "}, {"location": "API/models/es/#hyperparameters", "title": "Hyperparameters", "text": "

                                                  ParameterstrendCategoricalDistribution(choices=('add', 'mul', None))damped_trendCategoricalDistribution(choices=(True, False))seasonalCategoricalDistribution(choices=('add', 'mul', None))spCategoricalDistribution(choices=(4, 6, 7, 12, None))use_boxcoxCategoricalDistribution(choices=(True, False))initialization_methodCategoricalDistribution(choices=('estimated', 'heuristic'))methodCategoricalDistribution(choices=('L-BFGS-B', 'TNC', 'SLSQP', 'Powell', 'trust-constr', 'bh', 'ls'))

                                                  "}, {"location": "API/models/es/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/es/#data-attributes", "title": "Data attributes", "text": "

                                                  Attributespipeline: PipelinePipeline of transforms.

                                                  Models that used automated feature scaling have the scaler added.

                                                  Tip

                                                  Use the plot_pipeline method to visualize the pipeline.

                                                  mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                  The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                  "}, {"location": "API/models/es/#utility-attributes", "title": "Utility attributes", "text": "

                                                  Attributesname: strName of the model.

                                                  Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                  This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                  This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                  This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                  • [param_name]: Parameter value used in this trial.
                                                  • estimator: Estimator used in this trial.
                                                  • [metric_name]: Metric score of the trial.
                                                  • [best_metric_name]: Best score so far in this study.
                                                  • time_trial: Duration of the trial.
                                                  • time_ht: Duration of the hyperparameter tuning.
                                                  • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                    For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                    This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                    All durations are in seconds. Possible values include:

                                                    • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                    • time_ht: Duration of the hyperparameter tuning.
                                                    • [metric]_train: Metric score on the train set.
                                                    • [metric]_test: Metric score on the test set.
                                                    • time_fit: Duration of the model fitting on the train set.
                                                    • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                    • time_bootstrap: Duration of the bootstrapping.
                                                    • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                      The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                      "}, {"location": "API/models/es/#methods", "title": "Methods", "text": "

                                                      The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                      bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                      method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                      Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                      Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                      reset: bool, default=False Whether to start a new run or continue the existing one.

                                                      method calibrate(**kwargs)[source]Calibrate the model.

                                                      Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                      Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                      method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                      This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                      Parametersrows: int, default=1 Number of plots in length.

                                                      cols: int, default=2 Number of plots in width.

                                                      horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                      vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                      title: str, dict or None, default=None Title for the plot.

                                                      • If None, no title is shown.
                                                      • If str, text for the title.
                                                      • If dict, title configuration.

                                                      legend: str, dict or None, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                      • If None: No legend is shown.
                                                      • If str: Location where to show the legend.
                                                      • If dict: Legend configuration.

                                                      figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                      filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                      display: bool, default=True Whether to render the plot.

                                                      Yieldsgo.Figure Plot object.

                                                      method clear()[source]Reset attributes and clear cache from the model.

                                                      Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                      • In-training validation scores
                                                      • Cached predictions.
                                                      • Shap values
                                                      • App instance
                                                      • Dashboard instance
                                                      • Calculated holdout data sets

                                                      method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                      Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                      Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                      method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                      ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                      By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                      Note

                                                      Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                      Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                      filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                      **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                      method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                      This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                      Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                      Returnspd.DataFrame Overview of the results.

                                                      method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                      Tip

                                                      Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                      Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                      rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                      threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                      • The task is binary or multilabel classification.
                                                      • The model has a predict_proba method.
                                                      • The metric evaluates predicted probabilities.

                                                      For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                      Returnspd.Series Scores of the model.

                                                      method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                      The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                      ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                      method fit(X=None, y=None)[source]Fit and validate the model.

                                                      The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                      ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                      y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                      method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                      In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                      Warning

                                                      Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                      Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                      method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                      Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                      Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                      Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                      method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                      Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                      Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                      reset: bool, default=False Whether to start a new study or continue the existing one.

                                                      method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                      Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                      ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                      • If None: y is ignored.
                                                      • If int: Position of the target column in X.
                                                      • If str: Name of the target column in X.
                                                      • If dict: Name of the target column and sequence of values.
                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                      • If dataframe: Target columns for multioutput tasks.

                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                      Returnsdataframe Original feature set. Only returned if provided.

                                                      series or dataframe Original target column. Only returned if provided.

                                                      method predict(fh, X=None, verbose=None)[source]Get predictions on new data or existing rows.

                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                      Read more in the user guide.

                                                      Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                      X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                      Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                      method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]Get prediction intervals on new data or existing rows.

                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_interval method.

                                                      Read more in the user guide.

                                                      Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                      X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                      coverage: float or sequence, default=0.9 Nominal coverage(s) of predictive interval(s).

                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                      Returnsdataframe Predictions with shape=(n_samples, 2) or shape=(n_samples, 2 * n_targets) for multivariate tasks.

                                                      method predict_proba(fh, X=None, marginal=True, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                      Read more in the user guide.

                                                      Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                      X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                      marginal: bool, default=True Whether returned distribution is marginal by time index.

                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                      Returnssktime.proba.Normal Predicted distribution.

                                                      method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]Get quantile forecasts on new data or existing rows.

                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_quantiles method.

                                                      Read more in the user guide.

                                                      Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                      X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                      alpha: float or list of float, default=[0.05, 0.95] A probability or list of probabilities at which quantile forecasts are computed.

                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                      Returnsdataframe Predictions with shape=(n_samples, len(alpha)) or shape=(n_samples, len(alpha) * n_targets) for multivariate tasks.

                                                      method predict_residuals(y, X=None, verbose=None)[source]Get residuals of forecasts on new data or existing rows.

                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_residuals method.

                                                      Read more in the user guide.

                                                      Parametersy: int, str, dict, sequence or dataframe Ground truth observations.

                                                      X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to y.

                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                      Returnsseries or dataframe Residuals with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                      method predict_var(fh, X=None, cov=False, verbose=None)[source]Get variance forecasts on new data or existing rows.

                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_var method.

                                                      Read more in the user guide.

                                                      Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                      X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                      cov: bool, default=False Whether to compute covariance matrix forecast or marginal variance forecasts.

                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                      Returnsdataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                      method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                      This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                      Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                      stage: str, default=\"None\" New desired stage for the model.

                                                      archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                      method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                      method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                      Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                      method score(y, X=None, fh=None, metric=None, verbose=None)[source]Get a metric score on new data.

                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                      Read more in the user guide.

                                                      Info

                                                      If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sktime's score method for estimators.

                                                      Parametersy: int, str, dict, sequence or dataframe Ground truth observations.

                                                      X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                      fh: int, sequence, ForecastingHorizon or None, default=None The forecasting horizon encoding the time stamps to forecast at.

                                                      metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                      Returnsfloat Metric score of the forecasts with respect to the ground truth y.

                                                      method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                                      The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                      Tip

                                                      Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                      Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                      host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                      port: int, default=8000 Port for HTTP server.

                                                      method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                      Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                      ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                      • If None: y is ignored.
                                                      • If int: Position of the target column in X.
                                                      • If str: Name of the target column in X.
                                                      • If dict: Name of the target column and sequence of values.
                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                      • If dataframe: Target columns for multioutput tasks.

                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                      Returnsdataframe Transformed feature set. Only returned if provided.

                                                      series or dataframe Transformed target column. Only returned if provided.

                                                      method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                      Recursively update the structure of the original layout with the values in the arguments.

                                                      Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                      method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                      Recursively update the structure of the original traces with the values in the arguments.

                                                      Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                      "}, {"location": "API/models/et/", "title": "ExtraTrees", "text": "

                                                      ET accept sparse native multilabel native multioutput

                                                      Extra-Trees use a meta estimator that fits a number of randomized decision trees (a.k.a. extra-trees) on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.

                                                      Corresponding estimators are:

                                                      • ExtraTreesClassifier for classification tasks.
                                                      • ExtraTreesRegressor for regression tasks.

                                                      Read more in sklearn's documentation.

                                                      See Also

                                                      DecisionTree Single Decision Tree.

                                                      ExtraTree Extremely Randomized Tree.

                                                      RandomForest Random Forest.

                                                      "}, {"location": "API/models/et/#example", "title": "Example", "text": "
                                                      >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"ET\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: ET\nMetric: f1\n\n\nResults for ExtraTrees:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9655\nTime elapsed: 0.110s\n-------------------------------------------------\nTime: 0.110s\n\n\nFinal results ==================== >>\nTotal time: 0.112s\n-------------------------------------\nExtraTrees --> f1: 0.9655\n
                                                      "}, {"location": "API/models/et/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                                                      Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('gini', 'entropy'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                      Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('squared_error', 'absolute_error'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                      "}, {"location": "API/models/et/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/et/#data-attributes", "title": "Data attributes", "text": "

                                                      Attributespipeline: PipelinePipeline of transforms.

                                                      Models that used automated feature scaling have the scaler added.

                                                      Tip

                                                      Use the plot_pipeline method to visualize the pipeline.

                                                      mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                      The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                      "}, {"location": "API/models/et/#utility-attributes", "title": "Utility attributes", "text": "

                                                      Attributesname: strName of the model.

                                                      Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                      This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                      This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                      This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                      • [param_name]: Parameter value used in this trial.
                                                      • estimator: Estimator used in this trial.
                                                      • [metric_name]: Metric score of the trial.
                                                      • [best_metric_name]: Best score so far in this study.
                                                      • time_trial: Duration of the trial.
                                                      • time_ht: Duration of the hyperparameter tuning.
                                                      • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                        For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                        This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                        The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                        All durations are in seconds. Possible values include:

                                                        • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                        • time_ht: Duration of the hyperparameter tuning.
                                                        • [metric]_train: Metric score on the train set.
                                                        • [metric]_test: Metric score on the test set.
                                                        • time_fit: Duration of the model fitting on the train set.
                                                        • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                        • time_bootstrap: Duration of the bootstrapping.
                                                        • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                          The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                          "}, {"location": "API/models/et/#methods", "title": "Methods", "text": "

                                                          The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                          bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                          method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                          Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                          Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                          reset: bool, default=False Whether to start a new run or continue the existing one.

                                                          method calibrate(**kwargs)[source]Calibrate the model.

                                                          Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                          Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                          This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                          Parametersrows: int, default=1 Number of plots in length.

                                                          cols: int, default=2 Number of plots in width.

                                                          horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                          vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                          title: str, dict or None, default=None Title for the plot.

                                                          • If None, no title is shown.
                                                          • If str, text for the title.
                                                          • If dict, title configuration.

                                                          legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                          • If None: No legend is shown.
                                                          • If str: Location where to show the legend.
                                                          • If dict: Legend configuration.

                                                          figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                          filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                          display: bool, default=True Whether to render the plot.

                                                          Yieldsgo.Figure Plot object.

                                                          method clear()[source]Reset attributes and clear cache from the model.

                                                          Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                          • In-training validation scores
                                                          • Cached predictions.
                                                          • Shap values
                                                          • App instance
                                                          • Dashboard instance
                                                          • Calculated holdout data sets

                                                          method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                          Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                          Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                          method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                          ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                          By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                          Note

                                                          Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                          Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                          filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                          **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                          method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                          This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                          Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                          Returnspd.DataFrame Overview of the results.

                                                          method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                          Read more in the user guide.

                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                          Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                          method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                          Tip

                                                          Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                          Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                          rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                          threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                          • The task is binary or multilabel classification.
                                                          • The model has a predict_proba method.
                                                          • The metric evaluates predicted probabilities.

                                                          For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                          Returnspd.Series Scores of the model.

                                                          method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                          The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                          ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                          method fit(X=None, y=None)[source]Fit and validate the model.

                                                          The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                          ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                          y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                          method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                          In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                          Warning

                                                          Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                          Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                          method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                          Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                          Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                          Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                          method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                          Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                          Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                          reset: bool, default=False Whether to start a new study or continue the existing one.

                                                          method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                          Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                          ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                          • If None: y is ignored.
                                                          • If int: Position of the target column in X.
                                                          • If str: Name of the target column in X.
                                                          • If dict: Name of the target column and sequence of values.
                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                          • If dataframe: Target columns for multioutput tasks.

                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                          Returnsdataframe Original feature set. Only returned if provided.

                                                          series or dataframe Original target column. Only returned if provided.

                                                          method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                          Read more in the user guide.

                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                          Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                          method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                          Read more in the user guide.

                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                          Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                          method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                          Read more in the user guide.

                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                          Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                          method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                          This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                          Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                          stage: str, default=\"None\" New desired stage for the model.

                                                          archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                          method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                          method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                          Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                          method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                          Read more in the user guide.

                                                          Info

                                                          If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                          • If None: X must be a selection of rows in the dataset.
                                                          • If int: Position of the target column in X.
                                                          • If str: Name of the target column in X.
                                                          • If dict: Name of the target column and sequence of values.
                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                          • If dataframe: Target columns for multioutput tasks.

                                                          metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                          Returnsfloat Metric score of X with respect to y.

                                                          method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                                          The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                          Tip

                                                          Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                          Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                          host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                          port: int, default=8000 Port for HTTP server.

                                                          method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                          Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                          ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                          • If None: y is ignored.
                                                          • If int: Position of the target column in X.
                                                          • If str: Name of the target column in X.
                                                          • If dict: Name of the target column and sequence of values.
                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                          • If dataframe: Target columns for multioutput tasks.

                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                          Returnsdataframe Transformed feature set. Only returned if provided.

                                                          series or dataframe Transformed target column. Only returned if provided.

                                                          method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                          Recursively update the structure of the original layout with the values in the arguments.

                                                          Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                          method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                          Recursively update the structure of the original traces with the values in the arguments.

                                                          Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                          "}, {"location": "API/models/etree/", "title": "ExtraTree", "text": "

                                                          ETree accept sparse native multilabel native multioutput

                                                          Extra-trees differ from classic decision trees in the way they are built. When looking for the best split to separate the samples of a node into two groups, random splits are drawn for each of the max_features randomly selected features and the best split among those is chosen. When max_features is set to 1, this amounts to building a totally random decision tree.

                                                          Corresponding estimators are:

                                                          • ExtraTreeClassifier for classification tasks.
                                                          • ExtraTreeRegressor for regression tasks.

                                                          Read more in sklearn's documentation.

                                                          See Also

                                                          DecisionTree Single Decision Tree.

                                                          ExtraTrees Extremely Randomized Trees.

                                                          RandomForest Random Forest.

                                                          "}, {"location": "API/models/etree/#example", "title": "Example", "text": "
                                                          >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"ETree\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: ETree\nMetric: f1\n\n\nResults for ExtraTree:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9241\nTime elapsed: 0.021s\n-------------------------------------------------\nTime: 0.021s\n\n\nFinal results ==================== >>\nTotal time: 0.024s\n-------------------------------------\nExtraTree --> f1: 0.9241\n
                                                          "}, {"location": "API/models/etree/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                                                          ParameterscriterionCategoricalDistribution(choices=('gini', 'entropy'))splitterCategoricalDistribution(choices=('random', 'best'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                          ParameterscriterionCategoricalDistribution(choices=('squared_error', 'absolute_error'))splitterCategoricalDistribution(choices=('random', 'best'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                          "}, {"location": "API/models/etree/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/etree/#data-attributes", "title": "Data attributes", "text": "

                                                          Attributespipeline: PipelinePipeline of transforms.

                                                          Models that used automated feature scaling have the scaler added.

                                                          Tip

                                                          Use the plot_pipeline method to visualize the pipeline.

                                                          mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                          The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                          "}, {"location": "API/models/etree/#utility-attributes", "title": "Utility attributes", "text": "

                                                          Attributesname: strName of the model.

                                                          Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                          This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                          This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                          This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                          • [param_name]: Parameter value used in this trial.
                                                          • estimator: Estimator used in this trial.
                                                          • [metric_name]: Metric score of the trial.
                                                          • [best_metric_name]: Best score so far in this study.
                                                          • time_trial: Duration of the trial.
                                                          • time_ht: Duration of the hyperparameter tuning.
                                                          • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                            For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                            This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                            The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                            All durations are in seconds. Possible values include:

                                                            • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                            • time_ht: Duration of the hyperparameter tuning.
                                                            • [metric]_train: Metric score on the train set.
                                                            • [metric]_test: Metric score on the test set.
                                                            • time_fit: Duration of the model fitting on the train set.
                                                            • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                            • time_bootstrap: Duration of the bootstrapping.
                                                            • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                              The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                              "}, {"location": "API/models/etree/#methods", "title": "Methods", "text": "

                                                              The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                              bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                              method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                              Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                              Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                              reset: bool, default=False Whether to start a new run or continue the existing one.

                                                              method calibrate(**kwargs)[source]Calibrate the model.

                                                              Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                              Parameters**kwargs Additional keyword arguments for sklearn's CalibratedClassifierCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                              method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                              This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                              Parametersrows: int, default=1 Number of plots in length.

                                                              cols: int, default=2 Number of plots in width.

                                                              horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                              vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                              title: str, dict or None, default=None Title for the plot.

                                                              • If None, no title is shown.
                                                              • If str, text for the title.
                                                              • If dict, title configuration.

                                                              legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                              • If None: No legend is shown.
                                                              • If str: Location where to show the legend.
                                                              • If dict: Legend configuration.

                                                              figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                              filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                              display: bool, default=True Whether to render the plot.

                                                              Yieldsgo.Figure Plot object.

                                                              method clear()[source]Reset attributes and clear cache from the model.

                                                              Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                              • In-training validation scores
                                                              • Cached predictions.
                                                              • Shap values
                                                              • App instance
                                                              • Dashboard instance
                                                              • Calculated holdout data sets

                                                              method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                              Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                              Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                              method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                              ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                              By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                              Note

                                                              Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                              Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                              filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                              **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                              method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                              This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                              Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                              Returnspd.DataFrame Overview of the results.

                                                              method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                              Read more in the user guide.

                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                              Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                              method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                              Tip

                                                              Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                              Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                              rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                              threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                              • The task is binary or multilabel classification.
                                                              • The model has a predict_proba method.
                                                              • The metric evaluates predicted probabilities.

                                                              For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                              Returnspd.Series Scores of the model.

                                                              method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                              The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                              ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                              method fit(X=None, y=None)[source]Fit and validate the model.

                                                              The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                              ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                              y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                              method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                              In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                              Warning

                                                              Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                              Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                              method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                              Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                              Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                              Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                              method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                              Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                              Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                              reset: bool, default=False Whether to start a new study or continue the existing one.

                                                              method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                              Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                              ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                              • If None: y is ignored.
                                                              • If int: Position of the target column in X.
                                                              • If str: Name of the target column in X.
                                                              • If dict: Name of the target column and sequence of values.
                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                              • If dataframe: Target columns for multioutput tasks.

                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                              Returnsdataframe Original feature set. Only returned if provided.

                                                              series or dataframe Original target column. Only returned if provided.

                                                              method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                              Read more in the user guide.

                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                              Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                              method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                              Read more in the user guide.

                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                              Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                              method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                              Read more in the user guide.

                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                              Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                              method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                              This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                              Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                              stage: str, default=\"None\" New desired stage for the model.

                                                              archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                              method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                              method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                              Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                              method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                              Read more in the user guide.

                                                              Info

                                                              If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                              • If None: X must be a selection of rows in the dataset.
                                                              • If int: Position of the target column in X.
                                                              • If str: Name of the target column in X.
                                                              • If dict: Name of the target column and sequence of values.
                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                              • If dataframe: Target columns for multioutput tasks.

                                                              metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                              Returnsfloat Metric score of X with respect to y.

                                                              method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                              The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                              Tip

                                                              Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                              Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                              host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                              port: int, default=8000 Port for HTTP server.

                                                              method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                              Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                              ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                              • If None: y is ignored.
                                                              • If int: Position of the target column in X.
                                                              • If str: Name of the target column in X.
                                                              • If dict: Name of the target column and sequence of values.
                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                              • If dataframe: Target columns for multioutput tasks.

                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                              Returnsdataframe Transformed feature set. Only returned if provided.

                                                              series or dataframe Transformed target column. Only returned if provided.

                                                              method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                              Recursively update the structure of the original layout with the values in the arguments.

                                                              Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                              method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                              Recursively update the structure of the original traces with the values in the arguments.

                                                              Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                              "}, {"location": "API/models/ets/", "title": "ETS", "text": "

                                                              ETS native multioutput

                                                              The ETS models are a family of time series models with an underlying state space model consisting of a level component, a trend component (T), a seasonal component (S), and an error term (E).

                                                              Corresponding estimators are:

                                                              • AutoETS for forecasting tasks.

                                                              See Also

                                                              ARIMA Autoregressive Integrated Moving Average Model.

                                                              ExponentialSmoothing Exponential Smoothing forecaster.

                                                              PolynomialTrend Polynomial Trend forecaster.

                                                              "}, {"location": "API/models/ets/#example", "title": "Example", "text": "
                                                              >>> from atom import ATOMForecaster\n>>> from sktime.datasets import load_airline\n\n>>> y = load_airline()\n\n>>> atom = ATOMForecaster(y, random_state=1)\n>>> atom.run(models=\"ETS\", verbose=2)\n\n\nTraining ========================= >>\nModels: ETS\nMetric: mape\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0858\nTest evaluation --> mape: -0.2305\nTime elapsed: 0.021s\n-------------------------------------------------\nTime: 0.021s\n\n\nFinal results ==================== >>\nTotal time: 0.022s\n-------------------------------------\nETS --> mape: -0.2305\n
                                                              "}, {"location": "API/models/ets/#hyperparameters", "title": "Hyperparameters", "text": "

                                                              ParameterserrorCategoricalDistribution(choices=('add', 'mul'))trendCategoricalDistribution(choices=('add', 'mul', None))damped_trendCategoricalDistribution(choices=(True, False))seasonalCategoricalDistribution(choices=('add', 'mul', None))spCategoricalDistribution(choices=(1, 4, 6, 7, 12))initialization_methodCategoricalDistribution(choices=('estimated', 'heuristic'))maxiterIntDistribution(high=2000, log=False, low=500, step=100)autoCategoricalDistribution(choices=(True, False))information_criterionCategoricalDistribution(choices=('aic', 'bic', 'aicc'))

                                                              "}, {"location": "API/models/ets/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/ets/#data-attributes", "title": "Data attributes", "text": "

                                                              Attributespipeline: PipelinePipeline of transforms.

                                                              Models that used automated feature scaling have the scaler added.

                                                              Tip

                                                              Use the plot_pipeline method to visualize the pipeline.

                                                              mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                              The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                              "}, {"location": "API/models/ets/#utility-attributes", "title": "Utility attributes", "text": "

                                                              Attributesname: strName of the model.

                                                              Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                              This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                              This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                              This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                              • [param_name]: Parameter value used in this trial.
                                                              • estimator: Estimator used in this trial.
                                                              • [metric_name]: Metric score of the trial.
                                                              • [best_metric_name]: Best score so far in this study.
                                                              • time_trial: Duration of the trial.
                                                              • time_ht: Duration of the hyperparameter tuning.
                                                              • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here for an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                All durations are in seconds. Possible values include:

                                                                • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                • time_ht: Duration of the hyperparameter tuning.
                                                                • [metric]_train: Metric score on the train set.
                                                                • [metric]_test: Metric score on the test set.
                                                                • time_fit: Duration of the model fitting on the train set.
                                                                • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                • time_bootstrap: Duration of the bootstrapping.
                                                                • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                  The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                  "}, {"location": "API/models/ets/#methods", "title": "Methods", "text": "

                                                                  The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                  bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                  method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                  Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                  Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                  reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                  method calibrate(**kwargs)[source]Calibrate the model.

                                                                  Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                  Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                  cols: int, default=2 Number of plots in width.

                                                                  horizontal_spacing: float, default=0.05 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                  vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                  title: str, dict or None, default=None Title for the plot.

                                                                  • If None, no title is shown.
                                                                  • If str, text for the title.
                                                                  • If dict, title configuration.

                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                  • If None: No legend is shown.
                                                                  • If str: Location where to show the legend.
                                                                  • If dict: Legend configuration.

                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                  display: bool, default=True Whether to render the plot.

                                                                  Yieldsgo.Figure Plot object.

                                                                  method clear()[source]Reset attributes and clear cache from the model.

                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                  • In-training validation scores
                                                                  • Cached predictions.
                                                                  • Shap values
                                                                  • App instance
                                                                  • Dashboard instance
                                                                  • Calculated holdout data sets

                                                                  method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                  Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                  Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                  method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                  ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                  By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                  Note

                                                                  Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                  filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                  **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                  method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                  This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                  Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                  Returnspd.DataFrame Overview of the results.

                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                  Tip

                                                                  Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                  • The task is binary or multilabel classification.
                                                                  • The model has a predict_proba method.
                                                                  • The metric evaluates predicted probabilities.

                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                  Returnspd.Series Scores of the model.

                                                                  method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                  The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                  method fit(X=None, y=None)[source]Fit and validate the model.

                                                                  The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                  ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                  y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                  method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                  In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                  Warning

                                                                  Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                  Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                  method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                  Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                  Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                  Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                  method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                  Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                  Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                  reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                  method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                  Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                  ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                  • If None: y is ignored.
                                                                  • If int: Position of the target column in X.
                                                                  • If str: Name of the target column in X.
                                                                  • If dict: Name of the target column and sequence of values.
                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                  Returnsdataframe Original feature set. Only returned if provided.

                                                                  series or dataframe Original target column. Only returned if provided.

                                                                  method predict(fh, X=None, verbose=None)[source]Get predictions on new data or existing rows.

                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                  Read more in the user guide.

                                                                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                  Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                                  method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]Get prediction intervals on new data or existing rows.

                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_interval method.

                                                                  Read more in the user guide.

                                                                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                  coverage: float or sequence, default=0.9 Nominal coverage(s) of predictive interval(s).

                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                  Returnsdataframe Predictions with shape=(n_samples, 2) or shape=(n_samples, 2 * n_targets) for multivariate tasks.

                                                                  method predict_proba(fh, X=None, marginal=True, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                  Read more in the user guide.

                                                                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                  marginal: bool, default=True Whether returned distribution is marginal by time index.

                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                  Returnssktime.proba.Normal Predicted distribution.

                                                                  method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]Get quantile forecasts on new data or existing rows.

                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_quantiles method.

                                                                  Read more in the user guide.

                                                                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                  alpha: float or list of float, default=[0.05, 0.95] A probability or list of probabilities at which quantile forecasts are computed.

                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                  Returnsdataframe Predictions with shape=(n_samples, len(alpha)) or shape=(n_samples, len(alpha) * n_targets) for multivariate tasks.

                                                                  method predict_residuals(y, X=None, verbose=None)[source]Get residuals of forecasts on new data or existing rows.

                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_residuals method.

                                                                  Read more in the user guide.

                                                                  Parametersy: int, str, dict, sequence or dataframe Ground truth observations.

                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to y.

                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                  Returnsseries or dataframe Residuals with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                                  method predict_var(fh, X=None, cov=False, verbose=None)[source]Get variance forecasts on new data or existing rows.

                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_var method.

                                                                  Read more in the user guide.

                                                                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                  cov: bool, default=False Whether to compute covariance matrix forecast or marginal variance forecasts.

                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                  Returnsdataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                                  method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                  This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                  Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                  stage: str, default=\"None\" New desired stage for the model.

                                                                  archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                  method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                  method score(y, X=None, fh=None, metric=None, verbose=None)[source]Get a metric score on new data.

                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                  Read more in the user guide.

                                                                  Info

                                                                  If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sktime's score method for estimators.

                                                                  Parametersy: int, str, dict, sequence or dataframe Ground truth observations.

                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                  fh: int, sequence or ForecastingHorizon or None, default=None The forecasting horizon encoding the time stamps to forecast at.

                                                                  metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                  Returnsfloat Metric score of y with respect to a ground truth.

                                                                  method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                                                  The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                  Tip

                                                                  Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                  Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                  host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                  port: int, default=8000 Port for HTTP server.

                                                                  method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                  Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                  • If None: y is ignored.
                                                                  • If int: Position of the target column in X.
                                                                  • If str: Name of the target column in X.
                                                                  • If dict: Name of the target column and sequence of values.
                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                  "}, {"location": "API/models/gbm/", "title": "GradientBoostingMachine", "text": "

                                                                  GBM accept sparse

                                                                  A Gradient Boosting Machine builds an additive model in a forward stage-wise fashion; it allows for the optimization of arbitrary differentiable loss functions. In each stage n_classes_ regression trees are fit on the negative gradient of the loss function, e.g. binary or multiclass log loss. Binary classification is a special case where only a single regression tree is induced.

                                                                  Corresponding estimators are:

                                                                  • GradientBoostingClassifier for classification tasks.
                                                                  • GradientBoostingRegressor for regression tasks.

                                                                  Read more in sklearn's documentation.

                                                                  Tip

                                                                  HistGradientBoosting is a much faster variant of this algorithm for intermediate datasets (n_samples >= 10k).

                                                                  See Also

                                                                  CatBoost Cat Boosting Machine.

                                                                  HistGradientBoosting Histogram-based Gradient Boosting Machine.

                                                                  LightGBM Light Gradient Boosting Machine.

                                                                  "}, {"location": "API/models/gbm/#example", "title": "Example", "text": "
                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"GBM\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: GBM\nMetric: f1\n\n\nResults for GradientBoostingMachine:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9589\nTime elapsed: 0.886s\n-------------------------------------------------\nTime: 0.886s\n\n\nFinal results ==================== >>\nTotal time: 0.890s\n-------------------------------------\nGradientBoostingMachine --> f1: 0.9589\n
                                                                  "}, {"location": "API/models/gbm/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                                                                  ParameterslossCategoricalDistribution(choices=('log_loss', 'exponential'))learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)n_estimatorsIntDistribution(high=500, log=False, low=10, step=10)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)criterionCategoricalDistribution(choices=('friedman_mse', 'squared_error'))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_depthIntDistribution(high=21, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                                  ParameterslossCategoricalDistribution(choices=('squared_error', 'absolute_error', 'huber', 'quantile'))learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)n_estimatorsIntDistribution(high=500, log=False, low=10, step=10)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)criterionCategoricalDistribution(choices=('friedman_mse', 'squared_error'))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_depthIntDistribution(high=21, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)alphaFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)

                                                                  "}, {"location": "API/models/gbm/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/gbm/#data-attributes", "title": "Data attributes", "text": "

                                                                  Attributespipeline: PipelinePipeline of transforms.

                                                                  Models that used automated feature scaling have the scaler added.

                                                                  Tip

                                                                  Use the plot_pipeline method to visualize the pipeline.

                                                                  mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                  The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                  "}, {"location": "API/models/gbm/#utility-attributes", "title": "Utility attributes", "text": "

                                                                  Attributesname: strName of the model.

                                                                  Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                  This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                  This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                  This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                  • [param_name]: Parameter value used in this trial.
                                                                  • estimator: Estimator used in this trial.
                                                                  • [metric_name]: Metric score of the trial.
                                                                  • [best_metric_name]: Best score so far in this study.
                                                                  • time_trial: Duration of the trial.
                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                  • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                    For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                    This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                    All durations are in seconds. Possible values include:

                                                                    • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                    • time_ht: Duration of the hyperparameter tuning.
                                                                    • [metric]_train: Metric score on the train set.
                                                                    • [metric]_test: Metric score on the test set.
                                                                    • time_fit: Duration of the model fitting on the train set.
                                                                    • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                    • time_bootstrap: Duration of the bootstrapping.
                                                                    • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                      The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                      "}, {"location": "API/models/gbm/#methods", "title": "Methods", "text": "

                                                                      The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                      bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                      method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                      Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                      Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                      reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                      method calibrate(**kwargs)[source]Calibrate the model.

                                                                      Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                      Parameters**kwargs Additional keyword arguments for sklearn's CalibratedClassifierCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                      method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                      This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                      Parametersrows: int, default=1 Number of plots in length.

                                                                      cols: int, default=2 Number of plots in width.

                                                                      horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                      vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                      title: str, dict or None, default=None Title for the plot.

                                                                      • If None, no title is shown.
                                                                      • If str, text for the title.
                                                                      • If dict, title configuration.

                                                                      legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                      • If None: No legend is shown.
                                                                      • If str: Location where to show the legend.
                                                                      • If dict: Legend configuration.

                                                                      figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                      filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                      display: bool, default=True Whether to render the plot.

                                                                      Yieldsgo.Figure Plot object.

                                                                      method clear()[source]Reset attributes and clear cache from the model.

                                                                      Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                      • In-training validation scores
                                                                      • Cached predictions.
                                                                      • Shap values
                                                                      • App instance
                                                                      • Dashboard instance
                                                                      • Calculated holdout data sets

                                                                      method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                      Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                      Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                      method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                      ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                      By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                      Note

                                                                      Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                      Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                      filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                      **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                      method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                      This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                      Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                      Returnspd.DataFrame Overview of the results.

                                                                      method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                      Read more in the user guide.

                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                      Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                      method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                      Tip

                                                                      Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                      Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                      rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                      threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                      • The task is binary or multilabel classification.
                                                                      • The model has a predict_proba method.
                                                                      • The metric evaluates predicted probabilities.

                                                                      For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                      Returnspd.Series Scores of the model.

                                                                      method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                      The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                      ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                      method fit(X=None, y=None)[source]Fit and validate the model.

                                                                      The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                      ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                      y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                      method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                      In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                      Warning

                                                                      Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                      Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                      method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                      Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                      Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                      Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                      method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                      Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                      Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                      reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                      method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                      Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                      ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                      • If None: y is ignored.
                                                                      • If int: Position of the target column in X.
                                                                      • If str: Name of the target column in X.
                                                                      • If dict: Name of the target column and sequence of values.
                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                      Returnsdataframe Original feature set. Only returned if provided.

                                                                      series or dataframe Original target column. Only returned if provided.

                                                                      method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                      Read more in the user guide.

                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                      Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                      method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                      Read more in the user guide.

                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                      Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                      method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                      Read more in the user guide.

                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                      Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                      method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                      This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                      Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                      stage: str, default=\"None\" New desired stage for the model.

                                                                      archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                      method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                      method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                      Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                      method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                      Read more in the user guide.

                                                                      Info

                                                                      If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                      • If None: X must be a selection of rows in the dataset.
                                                                      • If int: Position of the target column in X.
                                                                      • If str: Name of the target column in X.
                                                                      • If dict: Name of the target column and sequence of values.
                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                      metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                      Returnsfloat Metric score of X with respect to y.

                                                                      method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                                                      The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                      Tip

                                                                      Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                      Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                      host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                      port: int, default=8000 Port for HTTP server.

                                                                      method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                      Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                      ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                      • If None: y is ignored.
                                                                      • If int: Position of the target column in X.
                                                                      • If str: Name of the target column in X.
                                                                      • If dict: Name of the target column and sequence of values.
                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                      Returnsdataframe Transformed feature set. Only returned if provided.

                                                                      series or dataframe Transformed target column. Only returned if provided.

                                                                      method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                      Recursively update the structure of the original layout with the values in the arguments.

                                                                      Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                      method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                      Recursively update the structure of the original traces with the values in the arguments.

                                                                      Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                      "}, {"location": "API/models/gnb/", "title": "GaussianNB", "text": "

                                                                      GNB supports acceleration

                                                                      Gaussian Naive Bayes implements the Naive Bayes algorithm for classification. The likelihood of the features is assumed to be Gaussian.

                                                                      Corresponding estimators are:

                                                                      • GaussianNB for classification tasks.

                                                                      Read more in sklearn's documentation.

                                                                      See Also

                                                                      BernoulliNB Bernoulli Naive Bayes.

                                                                      CategoricalNB Categorical Naive Bayes.

                                                                      ComplementNB Complement Naive Bayes.

                                                                      "}, {"location": "API/models/gnb/#example", "title": "Example", "text": "
                                                                      >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"GNB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: GNB\nMetric: f1\n\n\nResults for GaussianNB:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9553\nTest evaluation --> f1: 0.9371\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nFinal results ==================== >>\nTotal time: 0.022s\n-------------------------------------\nGaussianNB --> f1: 0.9371\n
                                                                      "}, {"location": "API/models/gnb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/gnb/#data-attributes", "title": "Data attributes", "text": "

                                                                      Attributespipeline: PipelinePipeline of transforms.

                                                                      Models that used automated feature scaling have the scaler added.

                                                                      Tip

                                                                      Use the plot_pipeline method to visualize the pipeline.

                                                                      mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                      The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                      "}, {"location": "API/models/gnb/#utility-attributes", "title": "Utility attributes", "text": "

                                                                      Attributesname: strName of the model.

                                                                      Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                      This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                      This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                      This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                      • [param_name]: Parameter value used in this trial.
                                                                      • estimator: Estimator used in this trial.
                                                                      • [metric_name]: Metric score of the trial.
                                                                      • [best_metric_name]: Best score so far in this study.
                                                                      • time_trial: Duration of the trial.
                                                                      • time_ht: Duration of the hyperparameter tuning.
                                                                      • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                        For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                        This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                        The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                        All durations are in seconds. Possible values include:

                                                                        • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                        • time_ht: Duration of the hyperparameter tuning.
                                                                        • [metric]_train: Metric score on the train set.
                                                                        • [metric]_test: Metric score on the test set.
                                                                        • time_fit: Duration of the model fitting on the train set.
                                                                        • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                        • time_bootstrap: Duration of the bootstrapping.
                                                                        • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                          The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                          "}, {"location": "API/models/gnb/#methods", "title": "Methods", "text": "

                                                                          The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                          bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                          method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                          Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                          Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                          reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                          method calibrate(**kwargs)[source]Calibrate the model.

                                                                          Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                          Parameters**kwargs Additional keyword arguments for sklearn's CalibratedClassifierCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                          This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                          Parametersrows: int, default=1 Number of plots in length.

                                                                          cols: int, default=2 Number of plots in width.

                                                                          horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                          vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                          title: str, dict or None, default=None Title for the plot.

                                                                          • If None, no title is shown.
                                                                          • If str, text for the title.
                                                                          • If dict, title configuration.

                                                                          legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                          • If None: No legend is shown.
                                                                          • If str: Location where to show the legend.
                                                                          • If dict: Legend configuration.

                                                                          figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                          filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                          display: bool, default=True Whether to render the plot.

                                                                          Yieldsgo.Figure Plot object.

                                                                          method clear()[source]Reset attributes and clear cache from the model.

                                                                          Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                          • In-training validation scores
                                                                          • Cached predictions.
                                                                          • Shap values
                                                                          • App instance
                                                                          • Dashboard instance
                                                                          • Calculated holdout data sets

                                                                          method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                          Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                          Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                          method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                          ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                          By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                          Note

                                                                          Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                          Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                          filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                          **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                          method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                          This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                          Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                          Returnspd.DataFrame Overview of the results.

                                                                          method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                          Read more in the user guide.

                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                          Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                          method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                          Tip

                                                                          Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                          Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                          rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                          threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                          • The task is binary or multilabel classification.
                                                                          • The model has a predict_proba method.
                                                                          • The metric evaluates predicted probabilities.

                                                                          For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                          Returnspd.Series Scores of the model.

                                                                          method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                          The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                          ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                          method fit(X=None, y=None)[source]Fit and validate the model.

                                                                          The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                          ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                          y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                          method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                          In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                          Warning

                                                                          Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                          Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                          method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                          Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                          Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                          Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                          method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                          Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                          Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                          reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                          method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                          Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                          ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                          • If None: y is ignored.
                                                                          • If int: Position of the target column in X.
                                                                          • If str: Name of the target column in X.
                                                                          • If dict: Name of the target column and sequence of values.
                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                          Returnsdataframe Original feature set. Only returned if provided.

                                                                          series or dataframe Original target column. Only returned if provided.

                                                                          method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                          Read more in the user guide.

                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                          Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                          method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                          Read more in the user guide.

                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                          Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                          method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                          Read more in the user guide.

                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                          Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                          method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                          This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                          Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                          stage: str, default=\"None\" New desired stage for the model.

                                                                          archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                          method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                          method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                          Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                          method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                          Read more in the user guide.

                                                                          Info

                                                                          If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                          • If None: X must be a selection of rows in the dataset.
                                                                          • If int: Position of the target column in X.
                                                                          • If str: Name of the target column in X.
                                                                          • If dict: Name of the target column and sequence of values.
                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                          metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                          Returnsfloat Metric score of X with respect to y.

                                                                          method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                          The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                          Tip

                                                                          Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                          Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                          host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                          port: int, default=8000 Port for HTTP server.

                                                                          method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                          Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                          ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                          • If None: y is ignored.
                                                                          • If int: Position of the target column in X.
                                                                          • If str: Name of the target column in X.
                                                                          • If dict: Name of the target column and sequence of values.
                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                          Returnsdataframe Transformed feature set. Only returned if provided.

                                                                          series or dataframe Transformed target column. Only returned if provided.

                                                                          method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                          Recursively update the structure of the original layout with the values in the arguments.

                                                                          Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                          method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                          Recursively update the structure of the original traces with the values in the arguments.

                                                                          Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                          "}, {"location": "API/models/gp/", "title": "GaussianProcess", "text": "

                                                                          GP

                                                                          Gaussian Processes are a generic supervised learning method designed to solve regression and probabilistic classification problems. The advantages of Gaussian processes are:

                                                                          • The prediction interpolates the observations.
                                                                          • The prediction is probabilistic (Gaussian) so that one can compute empirical confidence intervals and decide based on those if one should refit (online fitting, adaptive fitting) the prediction in some region of interest.

                                                                          The disadvantages of Gaussian processes include:

                                                                          • They are not sparse, i.e., they use the whole samples/features information to perform the prediction.
                                                                          • They lose efficiency in high dimensional spaces, namely when the number of features exceeds a few dozens.

                                                                          Corresponding estimators are:

                                                                          • GaussianProcessClassifier for classification tasks.
                                                                          • GaussianProcessRegressor for regression tasks.

                                                                          Read more in sklearn's documentation.

                                                                          See Also

                                                                          GaussianNB Gaussian Naive Bayes.

                                                                          LinearDiscriminantAnalysis Linear Discriminant Analysis.

                                                                          PassiveAggressive Passive Aggressive.

                                                                          "}, {"location": "API/models/gp/#example", "title": "Example", "text": "
                                                                          >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"GP\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: GP\nMetric: f1\n\n\nResults for GaussianProcess:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9437\nTime elapsed: 0.105s\n-------------------------------------------------\nTime: 0.105s\n\n\nFinal results ==================== >>\nTotal time: 0.109s\n-------------------------------------\nGaussianProcess --> f1: 0.9437\n
                                                                          "}, {"location": "API/models/gp/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/gp/#data-attributes", "title": "Data attributes", "text": "

                                                                          Attributespipeline: PipelinePipeline of transforms.

                                                                          Models that used automated feature scaling have the scaler added.

                                                                          Tip

                                                                          Use the plot_pipeline method to visualize the pipeline.

                                                                          mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                          The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                          "}, {"location": "API/models/gp/#utility-attributes", "title": "Utility attributes", "text": "

                                                                          Attributesname: strName of the model.

                                                                          Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                          This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                          This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                          This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                          • [param_name]: Parameter value used in this trial.
                                                                          • estimator: Estimator used in this trial.
                                                                          • [metric_name]: Metric score of the trial.
                                                                          • [best_metric_name]: Best score so far in this study.
                                                                          • time_trial: Duration of the trial.
                                                                          • time_ht: Duration of the hyperparameter tuning.
                                                                          • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                            For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                            This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                            The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                            All durations are in seconds. Possible values include:

                                                                            • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                            • time_ht: Duration of the hyperparameter tuning.
                                                                            • [metric]_train: Metric score on the train set.
                                                                            • [metric]_test: Metric score on the test set.
                                                                            • time_fit: Duration of the model fitting on the train set.
                                                                            • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                            • time_bootstrap: Duration of the bootstrapping.
                                                                            • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                              The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                              "}, {"location": "API/models/gp/#methods", "title": "Methods", "text": "

                                                                              The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                              bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                              method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                              Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                              Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                              reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                              method calibrate(**kwargs)[source]Calibrate the model.

                                                                              Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                              Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                              method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                              This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                              Parametersrows: int, default=1 Number of plots in length.

                                                                              cols: int, default=2 Number of plots in width.

                                                                              horizontal_spacing: float, default=0.05 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                              vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                              title: str, dict or None, default=None Title for the plot.

                                                                              • If None, no title is shown.
                                                                              • If str, text for the title.
                                                                              • If dict, title configuration.

                                                                              legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                              • If None: No legend is shown.
                                                                              • If str: Location where to show the legend.
                                                                              • If dict: Legend configuration.

                                                                              figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                              filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                              display: bool, default=True Whether to render the plot.

                                                                              Yieldsgo.Figure Plot object.

                                                                              method clear()[source]Reset attributes and clear cache from the model.

                                                                              Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                              • In-training validation scores
                                                                              • Cached predictions.
                                                                              • Shap values
                                                                              • App instance
                                                                              • Dashboard instance
                                                                              • Calculated holdout data sets

                                                                              method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                              Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                              Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                              method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                              ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                              By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                              Note

                                                                              Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                              Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                              filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                              **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                              method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                              This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                              Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                              Returnspd.DataFrame Overview of the results.

                                                                              method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                              Read more in the user guide.

                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                              Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                              method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                              Tip

                                                                              Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                              Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                              rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                              threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                              • The task is binary or multilabel classification.
                                                                              • The model has a predict_proba method.
                                                                              • The metric evaluates predicted probabilities.

                                                                              For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                              Returnspd.Series Scores of the model.

                                                                              method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                              The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                              ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                              method fit(X=None, y=None)[source]Fit and validate the model.

                                                                              The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                              ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                              y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                              method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                              In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                              Warning

                                                                              Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                              Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                              method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                              Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                              Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                              Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                              method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                              Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                              Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                              reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                              method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                              Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                              ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                              • If None: y is ignored.
                                                                              • If int: Position of the target column in X.
                                                                              • If str: Name of the target column in X.
                                                                              • If dict: Name of the target column and sequence of values.
                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                              Returnsdataframe Original feature set. Only returned if provided.

                                                                              series or dataframe Original target column. Only returned if provided.

                                                                              method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                              Read more in the user guide.

                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                              Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                              method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                              Read more in the user guide.

                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                              Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                              method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                              Read more in the user guide.

                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                              Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                              method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                              This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                              Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                              stage: str, default=\"None\" New desired stage for the model.

                                                                              archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                              method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                              method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                              Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                              method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                              Read more in the user guide.

                                                                              Info

                                                                              If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                              • If None: X must be a selection of rows in the dataset.
                                                                              • If int: Position of the target column in X.
                                                                              • If str: Name of the target column in X.
                                                                              • If dict: Name of the target column and sequence of values.
                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                              metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                              Returnsfloat Metric score of X with respect to y.

                                                                              method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                                                              The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                              Tip

                                                                              Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                              Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                              host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                              port: int, default=8000 Port for HTTP server.

                                                                              method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                              Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                              ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                              • If None: y is ignored.
                                                                              • If int: Position of the target column in X.
                                                                              • If str: Name of the target column in X.
                                                                              • If dict: Name of the target column and sequence of values.
                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                              Returnsdataframe Transformed feature set. Only returned if provided.

                                                                              series or dataframe Transformed target column. Only returned if provided.

                                                                              method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                              Recursively update the structure of the original layout with the values in the arguments.

                                                                              Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                              method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                              Recursively update the structure of the original traces with the values in the arguments.

                                                                              Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                              "}, {"location": "API/models/hgbm/", "title": "HistGradientBoosting", "text": "

                                                                              hGBM

                                                                              This Histogram-based Gradient Boosting Machine is much faster than the standard GradientBoostingMachine for big datasets (n_samples>=10k). This variation first bins the input samples into integer-valued bins which tremendously reduces the number of splitting points to consider, and allows the algorithm to leverage integer-based data structures (histograms) instead of relying on sorted continuous values when building the trees.

                                                                              Corresponding estimators are:

                                                                              • HistGradientBoostingClassifier for classification tasks.
                                                                              • HistGradientBoostingRegressor for regression tasks.

                                                                              Read more in sklearn's documentation.

                                                                              See Also

                                                                              CatBoost Cat Boosting Machine.

                                                                              GradientBoostingMachine Gradient Boosting Machine.

                                                                              XGBoost Extreme Gradient Boosting.

                                                                              "}, {"location": "API/models/hgbm/#example", "title": "Example", "text": "
                                                                              >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"hGBM\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: hGBM\nMetric: f1\n\n\nResults for HistGradientBoosting:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9583\nTime elapsed: 0.357s\n-------------------------------------------------\nTime: 0.357s\n\n\nFinal results ==================== >>\nTotal time: 0.360s\n-------------------------------------\nHistGradientBoosting --> f1: 0.9583\n
                                                                              "}, {"location": "API/models/hgbm/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                                                                              Parameterslearning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_iterIntDistribution(high=500, log=False, low=10, step=10)max_leaf_nodesIntDistribution(high=50, log=False, low=10, step=1)max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_leafIntDistribution(high=30, log=False, low=10, step=1)l2_regularizationFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)

                                                                              ParameterslossCategoricalDistribution(choices=('squared_error', 'absolute_error', 'poisson', 'quantile', 'gamma'))quantileFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_iterIntDistribution(high=500, log=False, low=10, step=10)max_leaf_nodesIntDistribution(high=50, log=False, low=10, step=1)max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_leafIntDistribution(high=30, log=False, low=10, step=1)l2_regularizationFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)

                                                                              "}, {"location": "API/models/hgbm/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/hgbm/#data-attributes", "title": "Data attributes", "text": "

                                                                              Attributespipeline: PipelinePipeline of transforms.

                                                                              Models that used automated feature scaling have the scaler added.

                                                                              Tip

                                                                              Use the plot_pipeline method to visualize the pipeline.

                                                                              mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                              The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                              "}, {"location": "API/models/hgbm/#utility-attributes", "title": "Utility attributes", "text": "

                                                                              Attributesname: strName of the model.

                                                                              Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                              This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                              This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                              This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                              • [param_name]: Parameter value used in this trial.
                                                                              • estimator: Estimator used in this trial.
                                                                              • [metric_name]: Metric score of the trial.
                                                                              • [best_metric_name]: Best score so far in this study.
                                                                              • time_trial: Duration of the trial.
                                                                              • time_ht: Duration of the hyperparameter tuning.
                                                                              • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                All durations are in seconds. Possible values include:

                                                                                • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                • time_ht: Duration of the hyperparameter tuning.
                                                                                • [metric]_train: Metric score on the train set.
                                                                                • [metric]_test: Metric score on the test set.
                                                                                • time_fit: Duration of the model fitting on the train set.
                                                                                • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                • time_bootstrap: Duration of the bootstrapping.
                                                                                • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                  The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                  "}, {"location": "API/models/hgbm/#methods", "title": "Methods", "text": "

                                                                                  The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                  bootstrapping: Apply a bootstrap algorithm. calibrate: Calibrate the model. canvas: Create a figure with multiple plots. clear: Reset attributes and clear cache from the model. create_app: Create an interactive app to test model predictions. create_dashboard: Create an interactive dashboard to analyze the model. cross_validate: Evaluate the model using cross-validation. decision_function: Get confidence scores on new data or existing rows. evaluate: Get the model's scores for the provided metrics. export_pipeline: Export the transformer pipeline with final estimator. fit: Fit and validate the model. full_train: Train the estimator on the complete dataset. get_best_threshold: Get the threshold that maximizes the ROC curve. hyperparameter_tuning: Run the hyperparameter tuning algorithm. inverse_transform: Inversely transform new data through the pipeline. predict: Get predictions on new data or existing rows. predict_log_proba: Get class log-probabilities on new data or existing rows. predict_proba: Get class probabilities on new data or existing rows. register: Register the model in mlflow's model registry. reset_aesthetics: Reset the plot aesthetics to their default values. save_estimator: Save the estimator to a pickle file. score: Get a metric score on new data. serve: Serve the model as a REST API endpoint for inference. transform: Transform new data through the pipeline. update_layout: Update the properties of the plot's layout. update_traces: Update the properties of the plot's traces.

                                                                                  method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                  Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                  Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                  reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                  method calibrate(**kwargs)[source]Calibrate the model.

                                                                                  Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's CalibratedClassifierCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                  cols: int, default=2 Number of plots in width.

                                                                                  horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                  vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                  • If None, no title is shown.
                                                                                  • If str, text for the title.
                                                                                  • If dict, title configuration.

                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                  • If None: No legend is shown.
                                                                                  • If str: Location where to show the legend.
                                                                                  • If dict: Legend configuration.

                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                  display: bool, default=True Whether to render the plot.

                                                                                  Yieldsgo.Figure Plot object.

                                                                                  method clear()[source]Reset attributes and clear cache from the model.

                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                  • In-training validation scores
                                                                                  • Cached predictions.
                                                                                  • Shap values
                                                                                  • App instance
                                                                                  • Dashboard instance
                                                                                  • Calculated holdout data sets

                                                                                  method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                  Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                  Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                  method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                  ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                  By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                  Note

                                                                                  Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                  filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                  **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                  method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                  This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                  Returnspd.DataFrame Overview of the results.

                                                                                  method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                  Read more in the user guide.

                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                  Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                  Tip

                                                                                  Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                  • The task is binary or multilabel classification.
                                                                                  • The model has a predict_proba method.
                                                                                  • The metric evaluates predicted probabilities.

                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                  Returnspd.Series Scores of the model.

                                                                                  method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                  The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                  method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                  The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                  ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                  y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                  method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                  In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                  Warning

                                                                                  Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                  Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                  method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                  Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                  Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                  Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                  method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                  Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                  Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                  reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                  method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                  Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                  ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                  • If None: y is ignored.
                                                                                  • If int: Position of the target column in X.
                                                                                  • If str: Name of the target column in X.
                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                  Returnsdataframe Original feature set. Only returned if provided.

                                                                                  series or dataframe Original target column. Only returned if provided.

                                                                                  method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                  Read more in the user guide.

                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                  Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                  method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                  Read more in the user guide.

                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                  Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                  method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                  Read more in the user guide.

                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                  Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                  method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                  This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                  Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                  stage: str, default=\"None\" New desired stage for the model.

                                                                                  archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                  method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                  method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                  Read more in the user guide.

                                                                                  Info

                                                                                  If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                  • If None: X must be a selection of rows in the dataset.
                                                                                  • If int: Position of the target column in X.
                                                                                  • If str: Name of the target column in X.
                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                  metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                  Returnsfloat Metric score of X with respect to y.

                                                                                  method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                                                                  The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                  Tip

                                                                                  Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                  Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                  host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                  port: int, default=8000 Port for HTTP server.

                                                                                  method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                  Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                  • If None: y is ignored.
                                                                                  • If int: Position of the target column in X.
                                                                                  • If str: Name of the target column in X.
                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                  "}, {"location": "API/models/huber/", "title": "HuberRegression", "text": "

                                                                                  Huber needs scaling

                                                                                  Huber is a linear regression model that is robust to outliers. It makes sure that the loss function is not heavily influenced by the outliers while not completely ignoring their effect.

                                                                                  Corresponding estimators are:

                                                                                  • HuberRegressor for regression tasks.

                                                                                  Read more in sklearn's documentation.

                                                                                  See Also

                                                                                  AutomaticRelevanceDetermination Automatic Relevance Determination.

                                                                                  LeastAngleRegression Least Angle Regression.

                                                                                  OrdinaryLeastSquares Linear Regression.

                                                                                  "}, {"location": "API/models/huber/#example", "title": "Example", "text": "
                                                                                  >>> from atom import ATOMRegressor\n>>> from sklearn.datasets import fetch_california_housing\n\n>>> X, y = fetch_california_housing(return_X_y=True)\n\n>>> atom = ATOMRegressor(X, y, random_state=1)\n>>> atom.run(models=\"Huber\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= >>\nModels: Huber\nMetric: r2\n\n\nResults for HuberRegression:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.546\nTest evaluation --> r2: 0.5999\nTime elapsed: 0.187s\n-------------------------------------------------\nTime: 0.187s\n\n\nFinal results ==================== >>\nTotal time: 0.187s\n-------------------------------------\nHuberRegression --> r2: 0.5999\n
                                                                                  "}, {"location": "API/models/huber/#hyperparameters", "title": "Hyperparameters", "text": "

                                                                                  ParametersepsilonFloatDistribution(high=10.0, log=True, low=1.0, step=None)max_iterIntDistribution(high=500, log=False, low=50, step=10)alphaFloatDistribution(high=1.0, log=True, low=0.0001, step=None)

                                                                                  "}, {"location": "API/models/huber/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/huber/#data-attributes", "title": "Data attributes", "text": "

                                                                                  Attributespipeline: PipelinePipeline of transforms.

                                                                                  Models that used automated feature scaling have the scaler added.

                                                                                  Tip

                                                                                  Use the plot_pipeline method to visualize the pipeline.

                                                                                  mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                  The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                  "}, {"location": "API/models/huber/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                  Attributesname: strName of the model.

                                                                                  Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                  This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                  This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                  This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                  • [param_name]: Parameter value used in this trial.
                                                                                  • estimator: Estimator used in this trial.
                                                                                  • [metric_name]: Metric score of the trial.
                                                                                  • [best_metric_name]: Best score so far in this study.
                                                                                  • time_trial: Duration of the trial.
                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                  • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                    For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                    This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                    All durations are in seconds. Possible values include:

                                                                                    • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                    • time_ht: Duration of the hyperparameter tuning.
                                                                                    • [metric]_train: Metric score on the train set.
                                                                                    • [metric]_test: Metric score on the test set.
                                                                                    • time_fit: Duration of the model fitting on the train set.
                                                                                    • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                    • time_bootstrap: Duration of the bootstrapping.
                                                                                    • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                      The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                      "}, {"location": "API/models/huber/#methods", "title": "Methods", "text": "

                                                                                      The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                      bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                      method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                      Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                      Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                      reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                      method calibrate(**kwargs)[source]Calibrate the model.

                                                                                      Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                      method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                      This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                      Parametersrows: int, default=1 Number of plots in length.

                                                                                      cols: int, default=2 Number of plots in width.

                                                                                      horizontal_spacing: float, default=0.05 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                      vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                      title: str, dict or None, default=None Title for the plot.

                                                                                      • If None, no title is shown.
                                                                                      • If str, text for the title.
                                                                                      • If dict, title configuration.

                                                                                      legend: str, dict or None, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                      • If None: No legend is shown.
                                                                                      • If str: Location where to show the legend.
                                                                                      • If dict: Legend configuration.

                                                                                      figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                      filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                      display: bool, default=True Whether to render the plot.

                                                                                      Yieldsgo.Figure Plot object.

                                                                                      method clear()[source]Reset attributes and clear cache from the model.

                                                                                      Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                      • In-training validation scores
                                                                                      • Cached predictions.
                                                                                      • Shap values
                                                                                      • App instance
                                                                                      • Dashboard instance
                                                                                      • Calculated holdout data sets

                                                                                      method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                      Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                      Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                      method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                      ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                      By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                      Note

                                                                                      Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                      Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                      filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                      **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                      method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                      This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                      Returnspd.DataFrame Overview of the results.

                                                                                      method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                      Read more in the user guide.

                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                      Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                      method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                      Tip

                                                                                      Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                      Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                      rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                      threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                      • The task is binary or multilabel classification.
                                                                                      • The model has a predict_proba method.
                                                                                      • The metric evaluates predicted probabilities.

                                                                                      For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                      Returnspd.Series Scores of the model.

                                                                                      method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                      The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                      ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                      method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                      The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                      ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                      y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                      method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                      In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                      Warning

                                                                                      Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                      Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                      method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                      Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                      Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                      Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                      method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                      Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                      Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                      reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                      method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                      Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                      ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                      • If None: y is ignored.
                                                                                      • If int: Position of the target column in X.
                                                                                      • If str: Name of the target column in X.
                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                      Returnsdataframe Original feature set. Only returned if provided.

                                                                                      series or dataframe Original target column. Only returned if provided.

                                                                                      method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                      Read more in the user guide.

                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                      Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                      method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                      Read more in the user guide.

                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                      Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                      method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                      Read more in the user guide.

                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                      Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                      method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                      This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                      Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                      stage: str, default=\"None\" New desired stage for the model.

                                                                                      archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                      method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                      method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                      Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                      method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                      Read more in the user guide.

                                                                                      Info

                                                                                      If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                      • If None: X must be a selection of rows in the dataset.
                                                                                      • If int: Position of the target column in X.
                                                                                      • If str: Name of the target column in X.
                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                      metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                      Returnsfloat Metric score of X with respect to y.

                                                                                      method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                      The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                      Tip

                                                                                      Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                      Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                      host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                      port: int, default=8000 Port for HTTP server.

                                                                                      method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                      Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                      ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                      • If None: y is ignored.
                                                                                      • If int: Position of the target column in X.
                                                                                      • If str: Name of the target column in X.
                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                      Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                      series or dataframe Transformed target column. Only returned if provided.

                                                                                      method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                      Recursively update the structure of the original layout with the values in the arguments.

                                                                                      Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                      method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                      Recursively update the structure of the original traces with the values in the arguments.

                                                                                      Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                      "}, {"location": "API/models/knn/", "title": "KNearestNeighbors", "text": "

                                                                                      KNN needs scaling accept sparse native multilabel native multioutput supports acceleration

                                                                                      K-Nearest Neighbors, as the name clearly indicates, implements the k-nearest neighbors vote. For regression, the target is predicted by local interpolation of the targets associated with the nearest neighbors in the training set.

                                                                                      Corresponding estimators are:

                                                                                      • KNeighborsClassifier for classification tasks.
                                                                                      • KNeighborsRegressor for regression tasks.

                                                                                      Read more in sklearn's documentation.

                                                                                      See Also

                                                                                      LinearDiscriminantAnalysis Linear Discriminant Analysis.

                                                                                      QuadraticDiscriminantAnalysis Quadratic Discriminant Analysis.

                                                                                      RadiusNearestNeighbors Radius Nearest Neighbors.

                                                                                      "}, {"location": "API/models/knn/#example", "title": "Example", "text": "
                                                                                      >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"KNN\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: KNN\nMetric: f1\n\n\nResults for KNearestNeighbors:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.981\nTest evaluation --> f1: 0.9793\nTime elapsed: 0.116s\n-------------------------------------------------\nTime: 0.116s\n\n\nFinal results ==================== >>\nTotal time: 0.119s\n-------------------------------------\nKNearestNeighbors --> f1: 0.9793\n
                                                                                      "}, {"location": "API/models/knn/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression sklearnsklearnexcuml

                                                                                      Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)

                                                                                      cpugpu

                                                                                      Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)

                                                                                      Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)

                                                                                      Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)

                                                                                      sklearnsklearnexcuml

                                                                                      Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)

                                                                                      cpugpu

                                                                                      Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)

                                                                                      Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)

                                                                                      Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)

                                                                                      "}, {"location": "API/models/knn/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/knn/#data-attributes", "title": "Data attributes", "text": "

                                                                                      Attributespipeline: PipelinePipeline of transforms.

                                                                                      Models that used automated feature scaling have the scaler added.

                                                                                      Tip

                                                                                      Use the plot_pipeline method to visualize the pipeline.

                                                                                      mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                      The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                      "}, {"location": "API/models/knn/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                      Attributesname: strName of the model.

                                                                                      Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                      This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                      This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                      This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                      • [param_name]: Parameter value used in this trial.
                                                                                      • estimator: Estimator used in this trial.
                                                                                      • [metric_name]: Metric score of the trial.
                                                                                      • [best_metric_name]: Best score so far in this study.
                                                                                      • time_trial: Duration of the trial.
                                                                                      • time_ht: Duration of the hyperparameter tuning.
                                                                                      • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                        For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                        This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                        The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                        All durations are in seconds. Possible values include:

                                                                                        • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                        • time_ht: Duration of the hyperparameter tuning.
                                                                                        • [metric]_train: Metric score on the train set.
                                                                                        • [metric]_test: Metric score on the test set.
                                                                                        • time_fit: Duration of the model fitting on the train set.
                                                                                        • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                        • time_bootstrap: Duration of the bootstrapping.
                                                                                        • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                          The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                          "}, {"location": "API/models/knn/#methods", "title": "Methods", "text": "

                                                                                          The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                          bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                          method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                          Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                          Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                          reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                          method calibrate(**kwargs)[source]Calibrate the model.

                                                                                          Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                          This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                          Parametersrows: int, default=1 Number of plots in length.

                                                                                          cols: int, default=2 Number of plots in width.

                                                                                          horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                          vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                          title: str, dict or None, default=None Title for the plot.

                                                                                          • If None, no title is shown.
                                                                                          • If str, text for the title.
                                                                                          • If dict, title configuration.

                                                                                          legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                          • If None: No legend is shown.
                                                                                          • If str: Location where to show the legend.
                                                                                          • If dict: Legend configuration.

                                                                                          figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                          filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                          display: bool, default=True Whether to render the plot.

                                                                                          Yieldsgo.Figure Plot object.

                                                                                          method clear()[source]Reset attributes and clear cache from the model.

                                                                                          Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                          • In-training validation scores
                                                                                          • Cached predictions.
                                                                                          • Shap values
                                                                                          • App instance
                                                                                          • Dashboard instance
                                                                                          • Calculated holdout data sets

                                                                                          method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                          Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                          Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                          method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                          ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                          By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                          Note

                                                                                          Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                          Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                          filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                          **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                          method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                          This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                          Returnspd.DataFrame Overview of the results.

                                                                                          method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                          Read more in the user guide.

                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                          Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                          method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                          Tip

                                                                                          Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                          Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                          rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                          threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                          • The task is binary or multilabel classification.
                                                                                          • The model has a predict_proba method.
                                                                                          • The metric evaluates predicted probabilities.

                                                                                          For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                          Returnspd.Series Scores of the model.

                                                                                          method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                          The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                          ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                          method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                          The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                          ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                          y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                          method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                          In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                          Warning

                                                                                          Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                          Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                          method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                          Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                          Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                          Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                          method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                          Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                          Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                          reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                          method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                          Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                          ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                          • If None: y is ignored.
                                                                                          • If int: Position of the target column in X.
                                                                                          • If str: Name of the target column in X.
                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                          Returnsdataframe Original feature set. Only returned if provided.

                                                                                          series or dataframe Original target column. Only returned if provided.

                                                                                          method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                          Read more in the user guide.

                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                          Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                          method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                          Read more in the user guide.

                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                          Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                          method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                          Read more in the user guide.

                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                          Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                          method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                          This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                          Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                          stage: str, default=\"None\" New desired stage for the model.

                                                                                          archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                          method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                          method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                          Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                          method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                          Read more in the user guide.

                                                                                          Info

                                                                                          If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                          • If None: X must be a selection of rows in the dataset.
                                                                                          • If int: Position of the target column in X.
                                                                                          • If str: Name of the target column in X.
                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                          metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                          Returnsfloat Metric score of X with respect to y.

                                                                                          method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                                                                          The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                          Tip

                                                                                          Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                          Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                          host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                          port: int, default=8000 Port for HTTP server.

                                                                                          method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                          Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                          ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                          • If None: y is ignored.
                                                                                          • If int: Position of the target column in X.
                                                                                          • If str: Name of the target column in X.
                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                          Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                          series or dataframe Transformed target column. Only returned if provided.

                                                                                          method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                          Recursively update the structure of the original layout with the values in the arguments.

                                                                                          Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                          method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                          Recursively update the structure of the original traces with the values in the arguments.

                                                                                          Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                          "}, {"location": "API/models/lars/", "title": "LeastAngleRegression", "text": "

                                                                                          Lars needs scaling

                                                                                          Least-Angle Regression is a regression algorithm for high-dimensional data. Lars is similar to forward stepwise regression. At each step, it finds the feature most correlated with the target. When there are multiple features having equal correlation, instead of continuing along the same feature, it proceeds in a direction equiangular between the features.

                                                                                          Corresponding estimators are:

                                                                                          • Lars for regression tasks.

                                                                                          Read more in sklearn's documentation.

                                                                                          See Also

                                                                                          BayesianRidge Bayesian ridge regression.

                                                                                          HuberRegression Huber regressor.

                                                                                          OrdinaryLeastSquares Linear Regression.

                                                                                          "}, {"location": "API/models/lars/#example", "title": "Example", "text": "
                                                                                          >>> from atom import ATOMRegressor\n>>> from sklearn.datasets import fetch_california_housing\n\n>>> X, y = fetch_california_housing(return_X_y=True)\n\n>>> atom = ATOMRegressor(X, y, random_state=1)\n>>> atom.run(models=\"Lars\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= >>\nModels: Lars\nMetric: r2\n\n\nResults for LeastAngleRegression:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.6067\nTest evaluation --> r2: 0.6028\nTime elapsed: 0.136s\n-------------------------------------------------\nTime: 0.136s\n\n\nFinal results ==================== >>\nTotal time: 0.137s\n-------------------------------------\nLeastAngleRegression --> r2: 0.6028\n
                                                                                          "}, {"location": "API/models/lars/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/lars/#data-attributes", "title": "Data attributes", "text": "

                                                                                          Attributespipeline: PipelinePipeline of transforms.

                                                                                          Models that used automated feature scaling have the scaler added.

                                                                                          Tip

                                                                                          Use the plot_pipeline method to visualize the pipeline.

                                                                                          mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                          The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                          "}, {"location": "API/models/lars/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                          Attributesname: strName of the model.

                                                                                          Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                          This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                          This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                          This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                          • [param_name]: Parameter value used in this trial.
                                                                                          • estimator: Estimator used in this trial.
                                                                                          • [metric_name]: Metric score of the trial.
                                                                                          • [best_metric_name]: Best score so far in this study.
                                                                                          • time_trial: Duration of the trial.
                                                                                          • time_ht: Duration of the hyperparameter tuning.
                                                                                          • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                            For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                            This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                            The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                            All durations are in seconds. Possible values include:

                                                                                            • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                            • time_ht: Duration of the hyperparameter tuning.
                                                                                            • [metric]_train: Metric score on the train set.
                                                                                            • [metric]_test: Metric score on the test set.
                                                                                            • time_fit: Duration of the model fitting on the train set.
                                                                                            • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                            • time_bootstrap: Duration of the bootstrapping.
                                                                                            • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                              The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                              "}, {"location": "API/models/lars/#methods", "title": "Methods", "text": "

                                                                                              The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                              bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                              method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                              Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                              Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                              reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                              method calibrate(**kwargs)[source]Calibrate the model.

                                                                                              Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's CalibratedClassifierCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                              method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                              This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                              Parametersrows: int, default=1 Number of plots in length.

                                                                                              cols: int, default=2 Number of plots in width.

                                                                                              horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                              vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                              title: str, dict or None, default=None Title for the plot.

                                                                                              • If None, no title is shown.
                                                                                              • If str, text for the title.
                                                                                              • If dict, title configuration.

                                                                                              legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                              • If None: No legend is shown.
                                                                                              • If str: Location where to show the legend.
                                                                                              • If dict: Legend configuration.

                                                                                              figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                              filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                              display: bool, default=True Whether to render the plot.

                                                                                              Yieldsgo.Figure Plot object.

                                                                                              method clear()[source]Reset attributes and clear cache from the model.

                                                                                              Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                              • In-training validation scores
                                                                                              • Cached predictions.
                                                                                              • Shap values
                                                                                              • App instance
                                                                                              • Dashboard instance
                                                                                              • Calculated holdout data sets

                                                                                              method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                              Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                              Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                              method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                              ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                              By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                              Note

                                                                                              Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                              Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                              filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                              **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                              method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                              This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                              Returnspd.DataFrame Overview of the results.

                                                                                              method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                              Read more in the user guide.

                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                              Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                              method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                              Tip

                                                                                              Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                              Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                              rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                              threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                              • The task is binary or multilabel classification.
                                                                                              • The model has a predict_proba method.
                                                                                              • The metric evaluates predicted probabilities.

                                                                                              For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                              Returnspd.Series Scores of the model.

                                                                                              method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                              The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                              ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                              method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                              The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                              ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                              y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                              method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                              In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                              Warning

                                                                                              Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                              Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                              method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                              Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                              Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                              Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                              method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                              Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                              Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                              reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                              method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                              Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                              ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                              • If None: y is ignored.
                                                                                              • If int: Position of the target column in X.
                                                                                              • If str: Name of the target column in X.
                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                              Returnsdataframe Original feature set. Only returned if provided.

                                                                                              series or dataframe Original target column. Only returned if provided.

                                                                                              method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                              Read more in the user guide.

                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                              Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                              method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                              Read more in the user guide.

                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                              Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                              method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                              Read more in the user guide.

                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                              Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                              method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                              This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                              Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                              stage: str, default=\"None\" New desired stage for the model.

                                                                                              archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                              method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                              method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                              Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                              method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                              Read more in the user guide.

                                                                                              Info

                                                                                              If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                              • If None: X must be a selection of rows in the dataset.
                                                                                              • If int: Position of the target column in X.
                                                                                              • If str: Name of the target column in X.
                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                              metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                              Returnsfloat Metric score of X with respect to y.

                                                                                              method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                              The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                              Tip

                                                                                              Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                              Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                              host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                              port: int, default=8000 Port for HTTP server.

                                                                                              method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                              Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                              ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                              • If None: y is ignored.
                                                                                              • If int: Position of the target column in X.
                                                                                              • If str: Name of the target column in X.
                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                              Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                              series or dataframe Transformed target column. Only returned if provided.

                                                                                              method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                              Recursively update the structure of the original layout with the values in the arguments.

                                                                                              Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                              method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                              Recursively update the structure of the original traces with the values in the arguments.

                                                                                              Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                              "}, {"location": "API/models/lasso/", "title": "Lasso", "text": "

                                                                                              Lasso needs scaling accept sparse supports acceleration

                                                                                              Linear least squares with l1 regularization.

                                                                                              Corresponding estimators are:

                                                                                              • Lasso for regression tasks.

                                                                                              Read more in sklearn's documentation.

                                                                                              See Also

                                                                                              ElasticNet Linear Regression with elasticnet regularization.

                                                                                              OrdinaryLeastSquares Linear Regression.

                                                                                              Ridge Linear least squares with l2 regularization.

                                                                                              "}, {"location": "API/models/lasso/#example", "title": "Example", "text": "
                                                                                              >>> from atom import ATOMRegressor\n>>> from sklearn.datasets import fetch_california_housing\n\n>>> X, y = fetch_california_housing(return_X_y=True)\n\n>>> atom = ATOMRegressor(X, y, random_state=1)\n>>> atom.run(models=\"Lasso\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= >>\nModels: Lasso\nMetric: r2\n\n\nResults for Lasso:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.0\nTest evaluation --> r2: -0.0001\nTime elapsed: 0.137s\n-------------------------------------------------\nTime: 0.137s\n\n\nFinal results ==================== >>\nTotal time: 0.139s\n-------------------------------------\nLasso --> r2: -0.0001 ~\n
                                                                                              "}, {"location": "API/models/lasso/#hyperparameters", "title": "Hyperparameters", "text": "sklearnsklearnexcuml

                                                                                              ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)selectionCategoricalDistribution(choices=('cyclic', 'random'))

                                                                                              cpugpu

                                                                                              ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)selectionCategoricalDistribution(choices=('cyclic', 'random'))

                                                                                              ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)selectionCategoricalDistribution(choices=('cyclic', 'random'))

                                                                                              ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)selectionCategoricalDistribution(choices=('cyclic', 'random'))

                                                                                              "}, {"location": "API/models/lasso/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/lasso/#data-attributes", "title": "Data attributes", "text": "

                                                                                              Attributespipeline: PipelinePipeline of transforms.

                                                                                              Models that used automated feature scaling have the scaler added.

                                                                                              Tip

                                                                                              Use the plot_pipeline method to visualize the pipeline.

                                                                                              mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                              The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                              "}, {"location": "API/models/lasso/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                              Attributesname: strName of the model.

                                                                                              Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                              This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                              This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                              This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                              • [param_name]: Parameter value used in this trial.
                                                                                              • estimator: Estimator used in this trial.
                                                                                              • [metric_name]: Metric score of the trial.
                                                                                              • [best_metric_name]: Best score so far in this study.
                                                                                              • time_trial: Duration of the trial.
                                                                                              • time_ht: Duration of the hyperparameter tuning.
                                                                                              • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                All durations are in seconds. Possible values include:

                                                                                                • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                • time_ht: Duration of the hyperparameter tuning.
                                                                                                • [metric]_train: Metric score on the train set.
                                                                                                • [metric]_test: Metric score on the test set.
                                                                                                • time_fit: Duration of the model fitting on the train set.
                                                                                                • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                • time_bootstrap: Duration of the bootstrapping.
                                                                                                • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                  The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                  "}, {"location": "API/models/lasso/#methods", "title": "Methods", "text": "

                                                                                                  The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                  bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                  method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                  Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                  Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                  reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                  method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                  Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                  vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                  • If None, no title is shown.
                                                                                                  • If str, text for the title.
                                                                                                  • If dict, title configuration.

                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                  • If None: No legend is shown.
                                                                                                  • If str: Location where to show the legend.
                                                                                                  • If dict: Legend configuration.

                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                  method clear()[source]Reset attributes and clear cache from the model.

                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                  • In-training validation scores
                                                                                                  • Cached predictions.
                                                                                                  • Shap values
                                                                                                  • App instance
                                                                                                  • Dashboard instance
                                                                                                  • Calculated holdout data sets

                                                                                                  method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                  Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                  Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                  method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                  ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                  By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                  Note

                                                                                                  Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                  filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                  **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                  method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                  This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                  Returnspd.DataFrame Overview of the results.

                                                                                                  method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                  Read more in the user guide.

                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                  Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                  Tip

                                                                                                  Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task is used.

                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                  • The task is binary or multilabel classification.
                                                                                                  • The model has a predict_proba method.
                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                  Returnspd.Series Scores of the model.

                                                                                                  method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                  The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                  method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                  The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                  ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                  y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                  method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                  In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                  Warning

                                                                                                  Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                  Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                  method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                  Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                  Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                  Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                  method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                  Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                  Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                  reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                  method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                  Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                  ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                  • If None: y is ignored.
                                                                                                  • If int: Position of the target column in X.
                                                                                                  • If str: Name of the target column in X.
                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                  Returnsdataframe Original feature set. Only returned if provided.

                                                                                                  series or dataframe Original target column. Only returned if provided.

                                                                                                  method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                  Read more in the user guide.

                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                  Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                  method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                  Read more in the user guide.

                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                  Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                  method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                  Read more in the user guide.

                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                  Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                  method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                  This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                  Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                  stage: str, default=\"None\" New desired stage for the model.

                                                                                                  archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                  method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                  method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                  Read more in the user guide.

                                                                                                  Info

                                                                                                  If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                  • If None: X must be a selection of rows in the dataset.
                                                                                                  • If int: Position of the target column in X.
                                                                                                  • If str: Name of the target column in X.
                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                  metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                  Returnsfloat Metric score of X with respect to y.

                                                                                                  method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                                                                                  The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                  Tip

                                                                                                  Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                  Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                  host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                  port: int, default=8000 Port for HTTP server.

                                                                                                  method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                  Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                  • If None: y is ignored.
                                                                                                  • If int: Position of the target column in X.
                                                                                                  • If str: Name of the target column in X.
                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                  "}, {"location": "API/models/lda/", "title": "LinearDiscriminantAnalysis", "text": "

                                                                                                  LDA

                                                                                                  Linear Discriminant Analysis is a classifier with a linear decision boundary, generated by fitting class conditional densities to the data and using Bayes\u2019 rule. The model fits a Gaussian density to each class, assuming that all classes share the same covariance matrix.

                                                                                                  Corresponding estimators are:

                                                                                                  • LinearDiscriminantAnalysis for classification tasks.

                                                                                                  Read more in sklearn's documentation.

                                                                                                  See Also

                                                                                                  LogisticRegression Logistic Regression.

                                                                                                  RadiusNearestNeighbors Radius Nearest Neighbors.

                                                                                                  QuadraticDiscriminantAnalysis Quadratic Discriminant Analysis.

                                                                                                  "}, {"location": "API/models/lda/#example", "title": "Example", "text": "
                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"LDA\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: LDA\nMetric: f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9743\nTest evaluation --> f1: 0.9726\nTime elapsed: 0.025s\n-------------------------------------------------\nTime: 0.025s\n\n\nFinal results ==================== >>\nTotal time: 0.028s\n-------------------------------------\nLinearDiscriminantAnalysis --> f1: 0.9726\n
                                                                                                  "}, {"location": "API/models/lda/#hyperparameters", "title": "Hyperparameters", "text": "

                                                                                                  ParameterssolverCategoricalDistribution(choices=('svd', 'lsqr', 'eigen'))shrinkageCategoricalDistribution(choices=(None, 'auto', 0.5, 0.6, 0.7, 0.8, 0.9, 1.0))

                                                                                                  "}, {"location": "API/models/lda/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/lda/#data-attributes", "title": "Data attributes", "text": "

                                                                                                  Attributespipeline: PipelinePipeline of transforms.

                                                                                                  Models that used automated feature scaling have the scaler added.

                                                                                                  Tip

                                                                                                  Use the plot_pipeline method to visualize the pipeline.

                                                                                                  mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                  The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                  "}, {"location": "API/models/lda/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                  Attributesname: strName of the model.

                                                                                                  Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                  This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                  This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                  This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                  • [param_name]: Parameter value used in this trial.
                                                                                                  • estimator: Estimator used in this trial.
                                                                                                  • [metric_name]: Metric score of the trial.
                                                                                                  • [best_metric_name]: Best score so far in this study.
                                                                                                  • time_trial: Duration of the trial.
                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                  • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                    For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                    This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                    All durations are in seconds. Possible values include:

                                                                                                    • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                    • time_ht: Duration of the hyperparameter tuning.
                                                                                                    • [metric]_train: Metric score on the train set.
                                                                                                    • [metric]_test: Metric score on the test set.
                                                                                                    • time_fit: Duration of the model fitting on the train set.
                                                                                                    • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                    • time_bootstrap: Duration of the bootstrapping.
                                                                                                    • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                      The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                      "}, {"location": "API/models/lda/#methods", "title": "Methods", "text": "

                                                                                                      The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                      bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                      method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                      Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                      Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                      reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                      method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                      Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                      method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                      This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                      Parametersrows: int, default=1 Number of plots in length.

                                                                                                      cols: int, default=2 Number of plots in width.

                                                                                                      horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                      vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                      title: str, dict or None, default=None Title for the plot.

                                                                                                      • If None, no title is shown.
                                                                                                      • If str, text for the title.
                                                                                                      • If dict, title configuration.

                                                                                                      legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                      • If None: No legend is shown.
                                                                                                      • If str: Location where to show the legend.
                                                                                                      • If dict: Legend configuration.

                                                                                                      figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                      filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                      display: bool, default=True Whether to render the plot.

                                                                                                      Yieldsgo.Figure Plot object.

                                                                                                      method clear()[source]Reset attributes and clear cache from the model.

                                                                                                      Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                      • In-training validation scores
                                                                                                      • Cached predictions.
                                                                                                      • Shap values
                                                                                                      • App instance
                                                                                                      • Dashboard instance
                                                                                                      • Calculated holdout data sets

                                                                                                      method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                      Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                      Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                      method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                      ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                      By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                      Note

                                                                                                      Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                      Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                      filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                      **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                      method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                      This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                      Returnspd.DataFrame Overview of the results.

                                                                                                      method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                      Read more in the user guide.

                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                      Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                      method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                      Tip

                                                                                                      Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                      Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                      rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                      threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                      • The task is binary or multilabel classification.
                                                                                                      • The model has a predict_proba method.
                                                                                                      • The metric evaluates predicted probabilities.

                                                                                                      For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                      Returnspd.Series Scores of the model.

                                                                                                      method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                      The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                      ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                      method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                      The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                      ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                      y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                      method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                      In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                      Warning

                                                                                                      Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                      Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                      method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                      Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                      Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                      Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                      method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                      Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                      Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                      reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                      method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                      Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                      ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                      • If None: y is ignored.
                                                                                                      • If int: Position of the target column in X.
                                                                                                      • If str: Name of the target column in X.
                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                      Returnsdataframe Original feature set. Only returned if provided.

                                                                                                      series or dataframe Original target column. Only returned if provided.

                                                                                                      method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                      Read more in the user guide.

                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                      Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                      method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                      Read more in the user guide.

                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                      Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                      method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                      Read more in the user guide.

                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                      Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                      method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                      This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                      Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                      stage: str, default=\"None\" New desired stage for the model.

                                                                                                      archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                      method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                      method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                      Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                      method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                      Read more in the user guide.

                                                                                                      Info

                                                                                                      If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                      • If None: X must be a selection of rows in the dataset.
                                                                                                      • If int: Position of the target column in X.
                                                                                                      • If str: Name of the target column in X.
                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                      metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                      Returnsfloat Metric score of X with respect to y.

                                                                                                      method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                                                                                      The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                      Tip

                                                                                                      Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                      Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                      host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                      port: int, default=8000 Port for HTTP server.

                                                                                                      method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                      Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                      ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                      • If None: y is ignored.
                                                                                                      • If int: Position of the target column in X.
                                                                                                      • If str: Name of the target column in X.
                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                      Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                      series or dataframe Transformed target column. Only returned if provided.

                                                                                                      method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                      Recursively update the structure of the original layout with the values in the arguments.

                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                      method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                      Recursively update the structure of the original traces with the values in the arguments.

                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                      "}, {"location": "API/models/lgb/", "title": "LightGBM", "text": "

                                                                                                      LGB needs scaling accept sparse allows validation supports acceleration

                                                                                                      LightGBM is a gradient boosting model that uses tree based learning algorithms. It is designed to be distributed and efficient with the following advantages:

                                                                                                      • Faster training speed and higher efficiency.
                                                                                                      • Lower memory usage.
                                                                                                      • Better accuracy.
                                                                                                      • Capable of handling large-scale data.

                                                                                                      Corresponding estimators are:

                                                                                                      • LGBMClassifier for classification tasks.
                                                                                                      • LGBMRegressor for regression tasks.

                                                                                                      Read more in LightGBM's documentation.

                                                                                                      Info

                                                                                                      Using LightGBM's GPU acceleration requires additional software dependencies.

                                                                                                      See Also

                                                                                                      CatBoost Cat Boosting Machine.

                                                                                                      GradientBoostingMachine Gradient Boosting Machine.

                                                                                                      XGBoost Extreme Gradient Boosting.

                                                                                                      "}, {"location": "API/models/lgb/#example", "title": "Example", "text": "
                                                                                                      >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"LGB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: LGB\nMetric: f1\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9583\nTime elapsed: 0.426s\n-------------------------------------------------\nTime: 0.426s\n\n\nFinal results ==================== >>\nTotal time: 0.429s\n-------------------------------------\nLightGBM --> f1: 0.9583\n
                                                                                                      "}, {"location": "API/models/lgb/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                                                                                                      Parametersn_estimatorsIntDistribution(high=500, log=False, low=20, step=10)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_depthIntDistribution(high=17, log=False, low=-1, step=2)num_leavesIntDistribution(high=40, log=False, low=20, step=1)min_child_weightFloatDistribution(high=100.0, log=True, low=0.0001, step=None)min_child_samplesIntDistribution(high=30, log=False, low=1, step=1)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)colsample_bytreeFloatDistribution(high=1.0, log=False, low=0.4, step=0.1)reg_alphaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)reg_lambdaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)

                                                                                                      Parametersn_estimatorsIntDistribution(high=500, log=False, low=20, step=10)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_depthIntDistribution(high=17, log=False, low=-1, step=2)num_leavesIntDistribution(high=40, log=False, low=20, step=1)min_child_weightFloatDistribution(high=100.0, log=True, low=0.0001, step=None)min_child_samplesIntDistribution(high=30, log=False, low=1, step=1)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)colsample_bytreeFloatDistribution(high=1.0, log=False, low=0.4, step=0.1)reg_alphaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)reg_lambdaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)

                                                                                                      "}, {"location": "API/models/lgb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/lgb/#data-attributes", "title": "Data attributes", "text": "

                                                                                                      Attributespipeline: PipelinePipeline of transforms.

                                                                                                      Models that used automated feature scaling have the scaler added.

                                                                                                      Tip

                                                                                                      Use the plot_pipeline method to visualize the pipeline.

                                                                                                      mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                      The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                      "}, {"location": "API/models/lgb/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                      Attributesname: strName of the model.

                                                                                                      Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                      This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                      This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                      This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                      • [param_name]: Parameter value used in this trial.
                                                                                                      • estimator: Estimator used in this trial.
                                                                                                      • [metric_name]: Metric score of the trial.
                                                                                                      • [best_metric_name]: Best score so far in this study.
                                                                                                      • time_trial: Duration of the trial.
                                                                                                      • time_ht: Duration of the hyperparameter tuning.
                                                                                                      • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                        For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                        This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training.

                                                                                                        Only the scores of the main metric are tracked. Included keys are: train and test. This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                        The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                        All durations are in seconds. Possible values include:

                                                                                                        • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                        • time_ht: Duration of the hyperparameter tuning.
                                                                                                        • [metric]_train: Metric score on the train set.
                                                                                                        • [metric]_test: Metric score on the test set.
                                                                                                        • time_fit: Duration of the model fitting on the train set.
                                                                                                        • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                        • time_bootstrap: Duration of the bootstrapping.
                                                                                                        • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                          The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                          "}, {"location": "API/models/lgb/#methods", "title": "Methods", "text": "

                                                                                                          The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                          bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                          method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                          Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                          Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                          reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                          method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                          Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's CalibratedClassifierCV (CCV). Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                          This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                          Parametersrows: int, default=1 Number of plots in length.

                                                                                                          cols: int, default=2 Number of plots in width.

                                                                                                          horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                          vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                          title: str, dict or None, default=None Title for the plot.

                                                                                                          • If None, no title is shown.
                                                                                                          • If str, text for the title.
                                                                                                          • If dict, title configuration.

                                                                                                          legend: str, dict or None, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                          • If None: No legend is shown.
                                                                                                          • If str: Location where to show the legend.
                                                                                                          • If dict: Legend configuration.

                                                                                                          figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                          filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                          display: bool, default=True Whether to render the plot.

                                                                                                          Yieldsgo.Figure Plot object.

                                                                                                          method clear()[source]Reset attributes and clear cache from the model.

                                                                                                          Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                          • In-training validation scores
                                                                                                          • Cached predictions
                                                                                                          • Shap values
                                                                                                          • App instance
                                                                                                          • Dashboard instance
                                                                                                          • Calculated holdout data sets

                                                                                                          method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                          Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                          Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                          method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                          ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                          By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                          Note

                                                                                                          Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                          Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                          filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                          **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                          method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                          This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                          Returnspd.DataFrame Overview of the results.

                                                                                                          method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                          Read more in the user guide.

                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                          Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                          method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                          Tip

                                                                                                          Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                          Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                          rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                          threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                          • The task is binary or multilabel classification.
                                                                                                          • The model has a predict_proba method.
                                                                                                          • The metric evaluates predicted probabilities.

                                                                                                          For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                          Returnspd.Series Scores of the model.

                                                                                                          method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                          The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                          ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                          method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                          The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                          ParametersX: dataframe or None, default=None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                          y: series, dataframe or None, default=None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                          method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                          In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                          Warning

                                                                                                          Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                          Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                          method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                          Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                          Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows on which to calculate the threshold.

                                                                                                          Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                          method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                          Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                          Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                          reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                          method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                          Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                          ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                          • If None: y is ignored.
                                                                                                          • If int: Position of the target column in X.
                                                                                                          • If str: Name of the target column in X.
                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                          Returnsdataframe Original feature set. Only returned if provided.

                                                                                                          series or dataframe Original target column. Only returned if provided.

                                                                                                          method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                          Read more in the user guide.

                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                          Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                          method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                          Read more in the user guide.

                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                          Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                          method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                          Read more in the user guide.

                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                          Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                          method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                          This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                          Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                          stage: str, default=\"None\" New desired stage for the model.

                                                                                                          archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                          method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                          method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                          Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                          method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                          Read more in the user guide.

                                                                                                          Info

                                                                                                          If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                          • If None: X must be a selection of rows in the dataset.
                                                                                                          • If int: Position of the target column in X.
                                                                                                          • If str: Name of the target column in X.
                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                          metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                          Returnsfloat Metric score of X with respect to y.

                                                                                                          method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                                                                                          The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                          Tip

                                                                                                          Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                          Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                          host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                          port: int, default=8000 Port for HTTP server.

                                                                                                          method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                          Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                          ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                          • If None: y is ignored.
                                                                                                          • If int: Position of the target column in X.
                                                                                                          • If str: Name of the target column in X.
                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                          Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                          series or dataframe Transformed target column. Only returned if provided.

                                                                                                          method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                          Recursively update the structure of the original layout with the values in the arguments.

                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                          method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                          Recursively update the structure of the original traces with the values in the arguments.

                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                          "}, {"location": "API/models/lr/", "title": "LogisticRegression", "text": "

                                                                                                          LR needs scaling accept sparse supports acceleration

                                                                                                          Logistic regression, despite its name, is a linear model for classification rather than regression. Logistic regression is also known in the literature as logit regression, maximum-entropy classification (MaxEnt) or the log-linear classifier. In this model, the probabilities describing the possible outcomes of a single trial are modeled using a logistic function.

                                                                                                          Corresponding estimators are:

                                                                                                          • LogisticRegression for classification tasks.

                                                                                                          Read more in sklearn's documentation.

                                                                                                          See Also

                                                                                                          GaussianProcess Gaussian process.

                                                                                                          LinearDiscriminantAnalysis Linear Discriminant Analysis.

                                                                                                          PassiveAggressive Passive Aggressive.

                                                                                                          "}, {"location": "API/models/lr/#example", "title": "Example", "text": "
                                                                                                          >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"RF\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: RF\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9524\nTime elapsed: 0.229s\n-------------------------------------------------\nTime: 0.229s\n\n\nFinal results ==================== >>\nTotal time: 0.232s\n-------------------------------------\nRandomForest --> f1: 0.9524\n
                                                                                                          "}, {"location": "API/models/lr/#hyperparameters", "title": "Hyperparameters", "text": "sklearnsklearnexcuml

                                                                                                          ParameterspenaltyCategoricalDistribution(choices=(None, 'l1', 'l2', 'elasticnet'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'))max_iterIntDistribution(high=1000, log=False, low=100, step=10)l1_ratioFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)

                                                                                                          cpugpu

                                                                                                          ParameterspenaltyCategoricalDistribution(choices=(None, 'l1', 'l2', 'elasticnet'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'))max_iterIntDistribution(high=1000, log=False, low=100, step=10)l1_ratioFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)

                                                                                                          ParameterspenaltyCategoricalDistribution(choices=(None, 'l1', 'l2', 'elasticnet'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'))max_iterIntDistribution(high=1000, log=False, low=100, step=10)l1_ratioFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)

                                                                                                          ParameterspenaltyCategoricalDistribution(choices=(None, 'l1', 'l2', 'elasticnet'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'))max_iterIntDistribution(high=1000, log=False, low=100, step=10)l1_ratioFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)

                                                                                                          "}, {"location": "API/models/lr/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/lr/#data-attributes", "title": "Data attributes", "text": "

                                                                                                          Attributespipeline: PipelinePipeline of transforms.

                                                                                                          Models that used automated feature scaling have the scaler added.

                                                                                                          Tip

                                                                                                          Use the plot_pipeline method to visualize the pipeline.

                                                                                                          mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                          The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                          "}, {"location": "API/models/lr/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                          Attributesname: strName of the model.

                                                                                                          Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                          This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                          This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                          This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                          • [param_name]: Parameter value used in this trial.
                                                                                                          • estimator: Estimator used in this trial.
                                                                                                          • [metric_name]: Metric score of the trial.
                                                                                                          • [best_metric_name]: Best score so far in this study.
                                                                                                          • time_trial: Duration of the trial.
                                                                                                          • time_ht: Duration of the hyperparameter tuning.
                                                                                                          • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                            For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                            This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                            The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                            All durations are in seconds. Possible values include:

                                                                                                            • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                            • time_ht: Duration of the hyperparameter tuning.
                                                                                                            • [metric]_train: Metric score on the train set.
                                                                                                            • [metric]_test: Metric score on the test set.
                                                                                                            • time_fit: Duration of the model fitting on the train set.
                                                                                                            • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                            • time_bootstrap: Duration of the bootstrapping.
                                                                                                            • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                              The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                              "}, {"location": "API/models/lr/#methods", "title": "Methods", "text": "

                                                                                                              The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                              bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                              method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                              Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                              Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                              reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                              method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                              Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                              method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                              This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                              Parametersrows: int, default=1 Number of plots in length.

                                                                                                              cols: int, default=2 Number of plots in width.

                                                                                                              horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                              vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                              title: str, dict or None, default=None Title for the plot.

                                                                                                              • If None, no title is shown.
                                                                                                              • If str, text for the title.
                                                                                                              • If dict, title configuration.

                                                                                                              legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                              • If None: No legend is shown.
                                                                                                              • If str: Location where to show the legend.
                                                                                                              • If dict: Legend configuration.

                                                                                                              figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                              filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                              display: bool, default=True Whether to render the plot.

                                                                                                              Yieldsgo.Figure Plot object.

                                                                                                              method clear()[source]Reset attributes and clear cache from the model.

                                                                                                              Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                              • In-training validation scores
                                                                                                              • Cached predictions.
                                                                                                              • Shap values
                                                                                                              • App instance
                                                                                                              • Dashboard instance
                                                                                                              • Calculated holdout data sets

                                                                                                              method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                              Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                              Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                              method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                              ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                              By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                              Note

                                                                                                              Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                              Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                              filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                              **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                              method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                              This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                              Returnspd.DataFrame Overview of the results.

                                                                                                              method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                              Read more in the user guide.

                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                              Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                              method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                              Tip

                                                                                                              Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                              Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                              rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                              threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                              • The task is binary or multilabel classification.
                                                                                                              • The model has a predict_proba method.
                                                                                                              • The metric evaluates predicted probabilities.

                                                                                                              For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                              Returnspd.Series Scores of the model.

                                                                                                              method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                              The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                              ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                              method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                              The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                              ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                              y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                              method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                              In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                              Warning

                                                                                                              Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                              Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                              method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                              Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                              Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                              Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                              method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                              Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                              Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                              reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                              method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                              Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                              ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                              • If None: y is ignored.
                                                                                                              • If int: Position of the target column in X.
                                                                                                              • If str: Name of the target column in X.
                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                              Returnsdataframe Original feature set. Only returned if provided.

                                                                                                              series or dataframe Original target column. Only returned if provided.

                                                                                                              method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                              Read more in the user guide.

                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                              Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                              method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                              Read more in the user guide.

                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                              Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                              method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                              Read more in the user guide.

                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                              Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                              method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                              This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                              Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                              stage: str, default=\"None\" New desired stage for the model.

                                                                                                              archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                              method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                              method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                              Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                              method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                              Read more in the user guide.

                                                                                                              Info

                                                                                                              If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                              • If None: X must be a selection of rows in the dataset.
                                                                                                              • If int: Position of the target column in X.
                                                                                                              • If str: Name of the target column in X.
                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                              metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                              Returnsfloat Metric score of X with respect to y.

                                                                                                              method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                                                                                              The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                              Tip

                                                                                                              Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                              Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                              host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                              port: int, default=8000 Port for HTTP server.

                                                                                                              method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                              Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                              ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                              • If None: y is ignored.
                                                                                                              • If int: Position of the target column in X.
                                                                                                              • If str: Name of the target column in X.
                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                              Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                              series or dataframe Transformed target column. Only returned if provided.

                                                                                                              method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                              Recursively update the structure of the original layout with the values in the arguments.

                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                              method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                              Recursively update the structure of the original traces with the values in the arguments.

                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                              "}, {"location": "API/models/lsvm/", "title": "LinearSVM", "text": "

                                                                                                              lSVM needs scaling accept sparse supports acceleration

                                                                                                              Similar to SupportVectorMachine but with a linear kernel. Implemented in terms of liblinear rather than libsvm, so it has more flexibility in the choice of penalties and loss functions and should scale better to large numbers of samples.

                                                                                                              Corresponding estimators are:

                                                                                                              • LinearSVC for classification tasks.
                                                                                                              • LinearSVR for regression tasks.

                                                                                                              Read more in sklearn's documentation.

                                                                                                              See Also

                                                                                                              KNearestNeighbors K-Nearest Neighbors.

                                                                                                              StochasticGradientDescent Stochastic Gradient Descent.

                                                                                                              SupportVectorMachine Support Vector Machine.

                                                                                                              "}, {"location": "API/models/lsvm/#example", "title": "Example", "text": "
                                                                                                              >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"lSVM\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: lSVM\nMetric: f1\n\n\nResults for LinearSVM:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.993\nTest evaluation --> f1: 0.9722\nTime elapsed: 0.089s\n-------------------------------------------------\nTime: 0.089s\n\n\nFinal results ==================== >>\nTotal time: 0.092s\n-------------------------------------\nLinearSVM --> f1: 0.9722\n
                                                                                                              "}, {"location": "API/models/lsvm/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression sklearncuml

                                                                                                              ParameterspenaltyCategoricalDistribution(choices=('l1', 'l2'))lossCategoricalDistribution(choices=('hinge', 'squared_hinge'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)dualCategoricalDistribution(choices=(True, False))

                                                                                                              ParameterspenaltyCategoricalDistribution(choices=('l1', 'l2'))lossCategoricalDistribution(choices=('hinge', 'squared_hinge'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)dualCategoricalDistribution(choices=(True, False))

                                                                                                              sklearncuml

                                                                                                              ParameterslossCategoricalDistribution(choices=('epsilon_insensitive', 'squared_epsilon_insensitive'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)dualCategoricalDistribution(choices=(True, False))

                                                                                                              ParameterslossCategoricalDistribution(choices=('epsilon_insensitive', 'squared_epsilon_insensitive'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)dualCategoricalDistribution(choices=(True, False))

                                                                                                              "}, {"location": "API/models/lsvm/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/lsvm/#data-attributes", "title": "Data attributes", "text": "

                                                                                                              Attributespipeline: PipelinePipeline of transforms.

                                                                                                              Models that used automated feature scaling have the scaler added.

                                                                                                              Tip

                                                                                                              Use the plot_pipeline method to visualize the pipeline.

                                                                                                              mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                              The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                              "}, {"location": "API/models/lsvm/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                              Attributesname: strName of the model.

                                                                                                              Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                              This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                              This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                              This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                              • [param_name]: Parameter value used in this trial.
                                                                                                              • estimator: Estimator used in this trial.
                                                                                                              • [metric_name]: Metric score of the trial.
                                                                                                              • [best_metric_name]: Best score so far in this study.
                                                                                                              • time_trial: Duration of the trial.
                                                                                                              • time_ht: Duration of the hyperparameter tuning.
                                                                                                              • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See an example here. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                All durations are in seconds. Possible values include:

                                                                                                                • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                • time_ht: Duration of the hyperparameter tuning.
                                                                                                                • [metric]_train: Metric score on the train set.
                                                                                                                • [metric]_test: Metric score on the test set.
                                                                                                                • time_fit: Duration of the model fitting on the train set.
                                                                                                                • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                • time_bootstrap: Duration of the bootstrapping.
                                                                                                                • time: Total duration of the run. feature_importance: pd.Series Normalized feature importance scores.

                                                                                                                  The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                  "}, {"location": "API/models/lsvm/#methods", "title": "Methods", "text": "

                                                                                                                  The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                  bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                  method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                  Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                  Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                  reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                  method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                  Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's CalibratedClassifierCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                  • If None, no title is shown.
                                                                                                                  • If str, text for the title.
                                                                                                                  • If dict, title configuration.

                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                  • If None: No legend is shown.
                                                                                                                  • If str: Location where to show the legend.
                                                                                                                  • If dict: Legend configuration.

                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                  method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                  • In-training validation scores
                                                                                                                  • Cached predictions.
                                                                                                                  • Shap values
                                                                                                                  • App instance
                                                                                                                  • Dashboard instance
                                                                                                                  • Calculated holdout data sets

                                                                                                                  method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                  Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                  Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                  method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                  ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                  By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                  Note

                                                                                                                  Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                  filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                  **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                  method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                  This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                  Returnspd.DataFrame Overview of the results.

                                                                                                                  method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                  Read more in the user guide.

                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                  Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                  Tip

                                                                                                                  Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                  • The model has a predict_proba method.
                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                  Returnspd.Series Scores of the model.

                                                                                                                  method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                  The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                  method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                  The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                  ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                  y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                  method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                  In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                  Warning

                                                                                                                  Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                  Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                  method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                  Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                  Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                  method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                  Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                  Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                  reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                  method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                  Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                  ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                  • If None: y is ignored.
                                                                                                                  • If int: Position of the target column in X.
                                                                                                                  • If str: Name of the target column in X.
                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                  Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                  series or dataframe Original target column. Only returned if provided.

                                                                                                                  method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                  Read more in the user guide.

                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                  Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                  method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                  Read more in the user guide.

                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                  Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                  method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                  Read more in the user guide.

                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                  Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                  method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                  This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                  Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                  stage: str, default=\"None\" New desired stage for the model.

                                                                                                                  archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                  method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                  method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                  Read more in the user guide.

                                                                                                                  Info

                                                                                                                  If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                  • If None: X must be a selection of rows in the dataset.
                                                                                                                  • If int: Position of the target column in X.
                                                                                                                  • If str: Name of the target column in X.
                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                  metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                  Returnsfloat Metric score of X with respect to y.

                                                                                                                  method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                                                                                                  The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                  Tip

                                                                                                                  Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                  Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                  host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                  port: int, default=8000 Port for HTTP server.

                                                                                                                  method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                  Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                  • If None: y is ignored.
                                                                                                                  • If int: Position of the target column in X.
                                                                                                                  • If str: Name of the target column in X.
                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                  "}, {"location": "API/models/mlp/", "title": "MultiLayerPerceptron", "text": "

                                                                                                                  MLP needs scaling accept sparse native multilabel allows validation

                                                                                                                  Multi-layer Perceptron is a supervised learning algorithm that learns a function by training on a dataset. Given a set of features and a target, it can learn a non-linear function approximator for either classification or regression. It is different from logistic regression, in that between the input and the output layer, there can be one or more non-linear layers, called hidden layers.

                                                                                                                  Corresponding estimators are:

                                                                                                                  • MLPClassifier for classification tasks.
                                                                                                                  • MLPRegressor for regression tasks.

                                                                                                                  Read more in sklearn's documentation.

                                                                                                                  See Also

                                                                                                                  PassiveAggressive Passive Aggressive.

                                                                                                                  Perceptron Linear Perceptron classification.

                                                                                                                  StochasticGradientDescent Stochastic Gradient Descent.

                                                                                                                  "}, {"location": "API/models/mlp/#example", "title": "Example", "text": "
                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"MLP\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: MLP\nMetric: f1\n\n\nResults for MultiLayerPerceptron:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9965\nTest evaluation --> f1: 0.979\nTime elapsed: 1.783s\n-------------------------------------------------\nTime: 1.783s\n\n\nFinal results ==================== >>\nTotal time: 1.786s\n-------------------------------------\nMultiLayerPerceptron --> f1: 0.979\n
                                                                                                                  "}, {"location": "API/models/mlp/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                                                                                                                  Parametershidden_layer_1IntDistribution(high=100, log=False, low=10, step=1)hidden_layer_2IntDistribution(high=100, log=False, low=0, step=1)hidden_layer_3IntDistribution(high=10, log=False, low=0, step=1)activationCategoricalDistribution(choices=('identity', 'logistic', 'tanh', 'relu'))solverCategoricalDistribution(choices=('lbfgs', 'sgd', 'adam'))alphaFloatDistribution(high=0.1, log=True, low=0.0001, step=None)batch_sizeCategoricalDistribution(choices=('auto', 8, 16, 32, 64, 128, 256))learning_rateCategoricalDistribution(choices=('constant', 'invscaling', 'adaptive'))learning_rate_initFloatDistribution(high=0.1, log=True, low=0.001, step=None)power_tFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)max_iterIntDistribution(high=500, log=False, low=50, step=10)

                                                                                                                  Parametershidden_layer_1IntDistribution(high=100, log=False, low=10, step=1)hidden_layer_2IntDistribution(high=100, log=False, low=0, step=1)hidden_layer_3IntDistribution(high=10, log=False, low=0, step=1)activationCategoricalDistribution(choices=('identity', 'logistic', 'tanh', 'relu'))solverCategoricalDistribution(choices=('lbfgs', 'sgd', 'adam'))alphaFloatDistribution(high=0.1, log=True, low=0.0001, step=None)batch_sizeCategoricalDistribution(choices=('auto', 8, 16, 32, 64, 128, 256))learning_rateCategoricalDistribution(choices=('constant', 'invscaling', 'adaptive'))learning_rate_initFloatDistribution(high=0.1, log=True, low=0.001, step=None)power_tFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)max_iterIntDistribution(high=500, log=False, low=50, step=10)

                                                                                                                  "}, {"location": "API/models/mlp/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/mlp/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                  Attributespipeline: PipelinePipeline of transforms.

                                                                                                                  Models that used automated feature scaling have the scaler added.

                                                                                                                  Tip

                                                                                                                  Use the plot_pipeline method to visualize the pipeline.

                                                                                                                  mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                  The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                  "}, {"location": "API/models/mlp/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                  Attributesname: strName of the model.

                                                                                                                  Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                  This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                  This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                  This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                  • [param_name]: Parameter value used in this trial.
                                                                                                                  • estimator: Estimator used in this trial.
                                                                                                                  • [metric_name]: Metric score of the trial.
                                                                                                                  • [best_metric_name]: Best score so far in this study.
                                                                                                                  • time_trial: Duration of the trial.
                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                  • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                    For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                    This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training.

                                                                                                                    Only the scores of the main metric are tracked. Included keys are: train and test. This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                    All durations are in seconds. Possible values include:

                                                                                                                    • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                    • time_ht: Duration of the hyperparameter tuning.
                                                                                                                    • [metric]_train: Metric score on the train set.
                                                                                                                    • [metric]_test: Metric score on the test set.
                                                                                                                    • time_fit: Duration of the model fitting on the train set.
                                                                                                                    • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                    • time_bootstrap: Duration of the bootstrapping.
                                                                                                                    • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                      The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                      "}, {"location": "API/models/mlp/#methods", "title": "Methods", "text": "

                                                                                                                      The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                      bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                      method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                      Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                      Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                      reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                      method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                      Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                      method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                      This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                      Parametersrows: int, default=1 Number of plots in length.

                                                                                                                      cols: int, default=2 Number of plots in width.

                                                                                                                      horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                      vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                      title: str, dict or None, default=None Title for the plot.

                                                                                                                      • If None, no title is shown.
                                                                                                                      • If str, text for the title.
                                                                                                                      • If dict, title configuration.

                                                                                                                      legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                      • If None: No legend is shown.
                                                                                                                      • If str: Location where to show the legend.
                                                                                                                      • If dict: Legend configuration.

                                                                                                                      figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                      filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                      display: bool, default=True Whether to render the plot.

                                                                                                                      Yieldsgo.Figure Plot object.

                                                                                                                      method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                      Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                      • In-training validation scores
                                                                                                                      • Cached predictions.
                                                                                                                      • Shap values
                                                                                                                      • App instance
                                                                                                                      • Dashboard instance
                                                                                                                      • Calculated holdout data sets

                                                                                                                      method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                      Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                      Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                      method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                      ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                      By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                      Note

                                                                                                                      Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                      filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                      **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                      method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                      This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                      Returnspd.DataFrame Overview of the results.

                                                                                                                      method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                      Read more in the user guide.

                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                      Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                      method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                      Tip

                                                                                                                      Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                      Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                      rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                      threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                      • The task is binary or multilabel classification.
                                                                                                                      • The model has a predict_proba method.
                                                                                                                      • The metric evaluates predicted probabilities.

                                                                                                                      For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                      Returnspd.Series Scores of the model.

                                                                                                                      method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                      The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                      ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                      method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                      The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                      ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                      y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                      method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                      In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                      Warning

                                                                                                                      Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                      Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                      method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                      Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                      Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                      method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                      Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                      Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                      reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                      method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                      Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                      ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                      • If None: y is ignored.
                                                                                                                      • If int: Position of the target column in X.
                                                                                                                      • If str: Name of the target column in X.
                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                      Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                      series or dataframe Original target column. Only returned if provided.

                                                                                                                      method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                      Read more in the user guide.

                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                      Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                      method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                      Read more in the user guide.

                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                      Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                      method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                      Read more in the user guide.

                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                      Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                      method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                      This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                      Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                      stage: str, default=\"None\" New desired stage for the model.

                                                                                                                      archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                      method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                      method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                      Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                      method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                      Read more in the user guide.

                                                                                                                      Info

                                                                                                                      If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                      • If None: X must be a selection of rows in the dataset.
                                                                                                                      • If int: Position of the target column in X.
                                                                                                                      • If str: Name of the target column in X.
                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                      metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                      Returnsfloat Metric score of X with respect to y.

                                                                                                                      method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                                                                                                      The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                      Tip

                                                                                                                      Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                      Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                      host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                      port: int, default=8000 Port for HTTP server.

                                                                                                                      method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                      Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                      ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                      • If None: y is ignored.
                                                                                                                      • If int: Position of the target column in X.
                                                                                                                      • If str: Name of the target column in X.
                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                      Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                      series or dataframe Transformed target column. Only returned if provided.

                                                                                                                      method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                      Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                      method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                      Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                      "}, {"location": "API/models/mnb/", "title": "MultinomialNB", "text": "

                                                                                                                      MNB accept sparse supports acceleration

                                                                                                                      MultinomialNB implements the Naive Bayes algorithm for multinomially distributed data, and is one of the two classic Naive Bayes variants used in text classification (where the data are typically represented as word vector counts, although tf-idf vectors are also known to work well in practice).

                                                                                                                      Corresponding estimators are:

                                                                                                                      • MultinomialNB for classification tasks.

                                                                                                                      Read more in sklearn's documentation.

                                                                                                                      See Also

                                                                                                                      BernoulliNB Bernoulli Naive Bayes.

                                                                                                                      ComplementNB Complement Naive Bayes.

                                                                                                                      GaussianNB Gaussian Naive Bayes.

                                                                                                                      "}, {"location": "API/models/mnb/#example", "title": "Example", "text": "
                                                                                                                      >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"MNB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: MNB\nMetric: f1\n\n\nResults for MultinomialNB:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9238\nTest evaluation --> f1: 0.9128\nTime elapsed: 0.021s\n-------------------------------------------------\nTime: 0.021s\n\n\nFinal results ==================== >>\nTotal time: 0.024s\n-------------------------------------\nMultinomialNB --> f1: 0.9128\n
                                                                                                                      "}, {"location": "API/models/mnb/#hyperparameters", "title": "Hyperparameters", "text": "sklearncuml

                                                                                                                      ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))

                                                                                                                      ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))

                                                                                                                      "}, {"location": "API/models/mnb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/mnb/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                      Attributespipeline: PipelinePipeline of transforms.

                                                                                                                      Models that used automated feature scaling have the scaler added.

                                                                                                                      Tip

                                                                                                                      Use the plot_pipeline method to visualize the pipeline.

                                                                                                                      mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                      The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                      "}, {"location": "API/models/mnb/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                      Attributesname: strName of the model.

                                                                                                                      Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                      This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                      This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                      This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                      • [param_name]: Parameter value used in this trial.
                                                                                                                      • estimator: Estimator used in this trial.
                                                                                                                      • [metric_name]: Metric score of the trial.
                                                                                                                      • [best_metric_name]: Best score so far in this study.
                                                                                                                      • time_trial: Duration of the trial.
                                                                                                                      • time_ht: Duration of the hyperparameter tuning.
                                                                                                                      • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                        For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                        This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                        The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                        All durations are in seconds. Possible values include:

                                                                                                                        • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                        • time_ht: Duration of the hyperparameter tuning.
                                                                                                                        • [metric]_train: Metric score on the train set.
                                                                                                                        • [metric]_test: Metric score on the test set.
                                                                                                                        • time_fit: Duration of the model fitting on the train set.
                                                                                                                        • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                        • time_bootstrap: Duration of the bootstrapping.
                                                                                                                        • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                          The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                          "}, {"location": "API/models/mnb/#methods", "title": "Methods", "text": "

                                                                                                                          The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                          bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                          method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                          Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                          Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                          reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                          method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                          Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                          This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                          Parametersrows: int, default=1 Number of plots in length.

                                                                                                                          cols: int, default=2 Number of plots in width.

                                                                                                                          horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                          vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                          title: str, dict or None, default=None Title for the plot.

                                                                                                                          • If None, no title is shown.
                                                                                                                          • If str, text for the title.
                                                                                                                          • If dict, title configuration.

                                                                                                                          legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                          • If None: No legend is shown.
                                                                                                                          • If str: Location where to show the legend.
                                                                                                                          • If dict: Legend configuration.

                                                                                                                          figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                          filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                          display: bool, default=True Whether to render the plot.

                                                                                                                          Yieldsgo.Figure Plot object.

                                                                                                                          method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                          Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                          • In-training validation scores
                                                                                                                          • Cached predictions.
                                                                                                                          • Shap values
                                                                                                                          • App instance
                                                                                                                          • Dashboard instance
                                                                                                                          • Calculated holdout data sets

                                                                                                                          method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                          Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                          Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                          method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                          ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                          By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                          Note

                                                                                                                          Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                          filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                          **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                          method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                          This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                          Returnspd.DataFrame Overview of the results.

                                                                                                                          method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                          Read more in the user guide.

                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                          Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                          method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                          Tip

                                                                                                                          Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                          Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                          rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                          threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                          • The task is binary or multilabel classification.
                                                                                                                          • The model has a predict_proba method.
                                                                                                                          • The metric evaluates predicted probabilities.

                                                                                                                          For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                          Returnspd.Series Scores of the model.

                                                                                                                          method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                          The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                          ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                          method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                          The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                          ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                          y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                          method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                          In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                          Warning

                                                                                                                          Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                          Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                          method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                          Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                          Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                          method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                          Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                          Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                          reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                          method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                          Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                          ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                          • If None: y is ignored.
                                                                                                                          • If int: Position of the target column in X.
                                                                                                                          • If str: Name of the target column in X.
                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                          Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                          series or dataframe Original target column. Only returned if provided.

                                                                                                                          method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                          Read more in the user guide.

                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                          Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                          method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                          Read more in the user guide.

                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                          Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                          method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                          Read more in the user guide.

                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                          Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                          method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                          This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                          Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                          stage: str, default=\"None\" New desired stage for the model.

                                                                                                                          archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                          method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                          method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                          Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                          method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                          Read more in the user guide.

                                                                                                                          Info

                                                                                                                          If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                          • If None: X must be a selection of rows in the dataset.
                                                                                                                          • If int: Position of the target column in X.
                                                                                                                          • If str: Name of the target column in X.
                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                          metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                          Returnsfloat Metric score of X with respect to y.

                                                                                                                          method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                                                                                                          The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                          Tip

                                                                                                                          Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                          Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                          host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                          port: int, default=8000 Port for HTTP server.

                                                                                                                          method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                          Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                          ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                          • If None: y is ignored.
                                                                                                                          • If int: Position of the target column in X.
                                                                                                                          • If str: Name of the target column in X.
                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                          Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                          series or dataframe Transformed target column. Only returned if provided.

                                                                                                                          method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                          Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                          method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                          Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                          "}, {"location": "API/models/nf/", "title": "NaiveForecaster", "text": "

                                                                                                                          NF native multioutput

                                                                                                                          NaiveForecaster is a dummy forecaster that makes forecasts using simple strategies based on naive assumptions about past trends continuing. When used in multivariate tasks, each column is forecasted with the same strategy.

                                                                                                                          Corresponding estimators are:

                                                                                                                          • NaiveForecaster for forecasting tasks.

                                                                                                                          See Also

                                                                                                                          ExponentialSmoothing Exponential Smoothing forecaster.

                                                                                                                          Dummy Dummy classifier/regressor.

                                                                                                                          PolynomialTrend Polynomial Trend forecaster.

                                                                                                                          "}, {"location": "API/models/nf/#example", "title": "Example", "text": "
                                                                                                                          >>> from atom import ATOMForecaster\n>>> from sktime.datasets import load_airline\n\n>>> y = load_airline()\n\n>>> atom = ATOMForecaster(y, random_state=1)\n>>> atom.run(models=\"NF\", verbose=2)\n\n\nTraining ========================= >>\nModels: NF\nMetric: mape\n\n\nResults for NaiveForecaster:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0858\nTest evaluation --> mape: -0.2305\nTime elapsed: 0.022s\n-------------------------------------------------\nTime: 0.022s\n\n\nFinal results ==================== >>\nTotal time: 0.023s\n-------------------------------------\nNaiveForecaster --> mape: -0.2305\n
                                                                                                                          "}, {"location": "API/models/nf/#hyperparameters", "title": "Hyperparameters", "text": "

                                                                                                                          ParametersstrategyCategoricalDistribution(choices=('last', 'mean', 'drift'))

                                                                                                                          "}, {"location": "API/models/nf/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/nf/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                          Attributespipeline: PipelinePipeline of transforms.

                                                                                                                          Models that used automated feature scaling have the scaler added.

                                                                                                                          Tip

                                                                                                                          Use the plot_pipeline method to visualize the pipeline.

                                                                                                                          mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                          The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                          "}, {"location": "API/models/nf/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                          Attributesname: strName of the model.

                                                                                                                          Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                          This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                          This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                          This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                          • [param_name]: Parameter value used in this trial.
                                                                                                                          • estimator: Estimator used in this trial.
                                                                                                                          • [metric_name]: Metric score of the trial.
                                                                                                                          • [best_metric_name]: Best score so far in this study.
                                                                                                                          • time_trial: Duration of the trial.
                                                                                                                          • time_ht: Duration of the hyperparameter tuning.
                                                                                                                          • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                            For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                            This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                            The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                            All durations are in seconds. Possible values include:

                                                                                                                            • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                            • time_ht: Duration of the hyperparameter tuning.
                                                                                                                            • [metric]_train: Metric score on the train set.
                                                                                                                            • [metric]_test: Metric score on the test set.
                                                                                                                            • time_fit: Duration of the model fitting on the train set.
                                                                                                                            • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                            • time_bootstrap: Duration of the bootstrapping.
                                                                                                                            • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                              The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                              "}, {"location": "API/models/nf/#methods", "title": "Methods", "text": "

                                                                                                                              The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                              bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                              method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                              Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                              Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                              reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                              method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                              Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                              method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                              This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                              Parametersrows: int, default=1 Number of plots in length.

                                                                                                                              cols: int, default=2 Number of plots in width.

                                                                                                                              horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                              vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                              title: str, dict or None, default=None Title for the plot.

                                                                                                                              • If None, no title is shown.
                                                                                                                              • If str, text for the title.
                                                                                                                              • If dict, title configuration.

                                                                                                                              legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                              • If None: No legend is shown.
                                                                                                                              • If str: Location where to show the legend.
                                                                                                                              • If dict: Legend configuration.

                                                                                                                              figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                              filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                              display: bool, default=True Whether to render the plot.

                                                                                                                              Yieldsgo.Figure Plot object.

                                                                                                                              method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                              Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                              • In-training validation scores
                                                                                                                              • Cached predictions.
                                                                                                                              • Shap values
                                                                                                                              • App instance
                                                                                                                              • Dashboard instance
                                                                                                                              • Calculated holdout data sets

                                                                                                                              method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                              Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                              Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                              method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                              ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                              By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                              Note

                                                                                                                              Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                              filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                              **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                              method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                              This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                              Returnspd.DataFrame Overview of the results.

                                                                                                                              method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                              Tip

                                                                                                                              Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                              Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                              rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                              threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                              • The task is binary or multilabel classification.
                                                                                                                              • The model has a predict_proba method.
                                                                                                                              • The metric evaluates predicted probabilities.

                                                                                                                              For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                              Returnspd.Series Scores of the model.

                                                                                                                              method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                              The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                              ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                              method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                              The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                              ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                              y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                              method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                              In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                              Warning

                                                                                                                              Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                              Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                              method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                              Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                              Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                              method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                              Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                              Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                              reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                              method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                              Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                              ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                              • If None: y is ignored.
                                                                                                                              • If int: Position of the target column in X.
                                                                                                                              • If str: Name of the target column in X.
                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                              Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                              series or dataframe Original target column. Only returned if provided.

                                                                                                                              method predict(fh, X=None, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                              Read more in the user guide.

                                                                                                                              Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                              Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                                                                                              method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]Get prediction intervals on new data or existing rows.

                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_interval method.

                                                                                                                              Read more in the user guide.

                                                                                                                              Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                              coverage: float or sequence, default=0.9 Nominal coverage(s) of predictive interval(s).

                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                              Returnsdataframe Predictions with shape=(n_samples, 2) or shape=(n_samples, 2 * n_targets) for multivariate tasks.

                                                                                                                              method predict_proba(fh, X=None, marginal=True, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                              Read more in the user guide.

                                                                                                                              Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                              marginal: bool, default=True Whether returned distribution is marginal by time index.

                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                              Returnssktime.proba.Normal Predicted distribution.

                                                                                                                              method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]Get quantile forecasts on new data or existing rows.

                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_quantiles method.

                                                                                                                              Read more in the user guide.

                                                                                                                              Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                              alpha: float or list of float, default=[0.05, 0.95] A probability or list of probabilities at which quantile forecasts are computed.

                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                              Returnsdataframe Predictions with shape=(n_samples, len(alpha)) or shape=(n_samples, len(alpha) * n_targets) for multivariate tasks.

                                                                                                                              method predict_residuals(y, X=None, verbose=None)[source]Get residuals of forecasts on new data or existing rows.

                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_residuals method.

                                                                                                                              Read more in the user guide.

                                                                                                                              Parametersy: int, str, dict, sequence or dataframe Ground truth observations.

                                                                                                                              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to y.

                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                              Returnsseries or dataframe Residuals with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                                                                                              method predict_var(fh, X=None, cov=False, verbose=None)[source]Get variance forecasts on new data or existing rows.

                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_var method.

                                                                                                                              Read more in the user guide.

                                                                                                                              Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                              cov: bool, default=False Whether to compute covariance matrix forecast or marginal variance forecasts.

                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                              Returnsdataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                                                                                              method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                              This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                              Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                              stage: str, default=\"None\" New desired stage for the model.

                                                                                                                              archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                              method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                              method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                              Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                              method score(y, X=None, fh=None, metric=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                              Read more in the user guide.

                                                                                                                              Info

                                                                                                                              If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sktime's score method for estimators.

                                                                                                                              Parametersy: int, str, dict, sequence or dataframe Ground truth observations.

                                                                                                                              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                              fh: int, sequence or ForecastingHorizon or None, default=None The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                              metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                              Returnsfloat Metric score of y with respect to a ground truth.

                                                                                                                              method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                              The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                              Tip

                                                                                                                              Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                              Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                              host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                              port: int, default=8000 Port for HTTP server.

                                                                                                                              method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                              Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                              ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                              • If None: y is ignored.
                                                                                                                              • If int: Position of the target column in X.
                                                                                                                              • If str: Name of the target column in X.
                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                              Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                              series or dataframe Transformed target column. Only returned if provided.

                                                                                                                              method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                              Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                              method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                              Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                              "}, {"location": "API/models/ols/", "title": "OrdinaryLeastSquares", "text": "

                                                                                                                              OLS needs scaling accept sparse supports acceleration

                                                                                                                              Ordinary Least Squares is just linear regression without any regularization. It fits a linear model with coefficients w=(w1, ..., wp) to minimize the residual sum of squares between the observed targets in the dataset, and the targets predicted by the linear approximation.

                                                                                                                              Corresponding estimators are:

                                                                                                                              • LinearRegression for regression tasks.

                                                                                                                              Read more in sklearn's documentation.

                                                                                                                              See Also

                                                                                                                              ElasticNet Linear Regression with elasticnet regularization.

                                                                                                                              Lasso Linear Regression with lasso regularization.

                                                                                                                              Ridge Linear least squares with l2 regularization.

                                                                                                                              "}, {"location": "API/models/ols/#example", "title": "Example", "text": "
                                                                                                                              >>> from atom import ATOMRegressor\n>>> from sklearn.datasets import fetch_california_housing\n\n>>> X, y = fetch_california_housing(return_X_y=True)\n\n>>> atom = ATOMRegressor(X, y, random_state=1)\n>>> atom.run(models=\"OLS\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= >>\nModels: OLS\nMetric: r2\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.6067\nTest evaluation --> r2: 0.6028\nTime elapsed: 0.137s\n-------------------------------------------------\nTime: 0.137s\n\n\nFinal results ==================== >>\nTotal time: 0.138s\n-------------------------------------\nOrdinaryLeastSquares --> r2: 0.6028\n
                                                                                                                              "}, {"location": "API/models/ols/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/ols/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                              Attributespipeline: PipelinePipeline of transforms.

                                                                                                                              Models that used automated feature scaling have the scaler added.

                                                                                                                              Tip

                                                                                                                              Use the plot_pipeline method to visualize the pipeline.

                                                                                                                              mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                              The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                              "}, {"location": "API/models/ols/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                              Attributesname: strName of the model.

                                                                                                                              Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                              This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                              This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                              This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                              • [param_name]: Parameter value used in this trial.
                                                                                                                              • estimator: Estimator used in this trial.
                                                                                                                              • [metric_name]: Metric score of the trial.
                                                                                                                              • [best_metric_name]: Best score so far in this study.
                                                                                                                              • time_trial: Duration of the trial.
                                                                                                                              • time_ht: Duration of the hyperparameter tuning.
                                                                                                                              • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                All durations are in seconds. Possible values include:

                                                                                                                                • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                • [metric]_train: Metric score on the train set.
                                                                                                                                • [metric]_test: Metric score on the test set.
                                                                                                                                • time_fit: Duration of the model fitting on the train set.
                                                                                                                                • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                  The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                  "}, {"location": "API/models/ols/#methods", "title": "Methods", "text": "

                                                                                                                                  The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                  bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                  method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                  Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                  Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                                  reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                  method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                  Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                  • If None, no title is shown.
                                                                                                                                  • If str, text for the title.
                                                                                                                                  • If dict, title configuration.

                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                  • If None: No legend is shown.
                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                  method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                  • In-training validation scores
                                                                                                                                  • Cached predictions.
                                                                                                                                  • Shap values
                                                                                                                                  • App instance
                                                                                                                                  • Dashboard instance
                                                                                                                                  • Calculated holdout data sets

                                                                                                                                  method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                  Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                  Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                  method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                  ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                  By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                  Note

                                                                                                                                  Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                  filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                  **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                  method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                  This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                  Returnspd.DataFrame Overview of the results.

                                                                                                                                  method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                  Read more in the user guide.

                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                  Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                  Tip

                                                                                                                                  Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task is used.

                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                  Returnspd.Series Scores of the model.

                                                                                                                                  method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                  The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                  method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                  The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                  ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                  y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                  method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                  In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                  Warning

                                                                                                                                  Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                  Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                  method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                  Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                  Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                  method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                  Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                  Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                  reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                  method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                  Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                  ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                  • If None: y is ignored.
                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                  Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                  series or dataframe Original target column. Only returned if provided.

                                                                                                                                  method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                  Read more in the user guide.

                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                  Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                  method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                  Read more in the user guide.

                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                  Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                  method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                  Read more in the user guide.

                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                  Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                  method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                  This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                  Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                  stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                  archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                  method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                  method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                  Read more in the user guide.

                                                                                                                                  Info

                                                                                                                                  If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                  • If None: X must be a selection of rows in the dataset.
                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                  metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                  Returnsfloat Metric score of X with respect to y.

                                                                                                                                  method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                                                                                                                  The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                  Tip

                                                                                                                                  Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                  Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                  host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                  port: int, default=8000 Port for HTTP server.

                                                                                                                                  method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                  Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                  • If None: y is ignored.
                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                  "}, {"location": "API/models/omp/", "title": "OrthogonalMatchingPursuit", "text": "

                                                                                                                                  OMP needs scaling

                                                                                                                                  Orthogonal Matching Pursuit implements the OMP algorithm for approximating the fit of a linear model with constraints imposed on the number of non-zero coefficients.

                                                                                                                                  Corresponding estimators are:

                                                                                                                                  • OrthogonalMatchingPursuit for regression tasks.

                                                                                                                                  Read more in sklearn's documentation.

                                                                                                                                  See Also

                                                                                                                                  Lasso Linear Regression with lasso regularization.

                                                                                                                                  LeastAngleRegression Least Angle Regression.

                                                                                                                                  OrdinaryLeastSquares Linear Regression.

                                                                                                                                  "}, {"location": "API/models/omp/#example", "title": "Example", "text": "
                                                                                                                                  >>> from atom import ATOMRegressor\n>>> from sklearn.datasets import fetch_california_housing\n\n>>> X, y = fetch_california_housing(return_X_y=True)\n\n>>> atom = ATOMRegressor(X, y, random_state=1)\n>>> atom.run(models=\"OMP\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= >>\nModels: OMP\nMetric: r2\n\n\nResults for OrthogonalMatchingPursuit:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.4751\nTest evaluation --> r2: 0.4668\nTime elapsed: 0.135s\n-------------------------------------------------\nTime: 0.135s\n\n\nFinal results ==================== >>\nTotal time: 0.136s\n-------------------------------------\nOrthogonalMatchingPursuit --> r2: 0.4668\n
                                                                                                                                  "}, {"location": "API/models/omp/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/omp/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                  Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                  Models that used automated feature scaling have the scaler added.

                                                                                                                                  Tip

                                                                                                                                  Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                  mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                  The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                  "}, {"location": "API/models/omp/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                  Attributesname: strName of the model.

                                                                                                                                  Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                  This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                  This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                  This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                  • [param_name]: Parameter value used in this trial.
                                                                                                                                  • estimator: Estimator used in this trial.
                                                                                                                                  • [metric_name]: Metric score of the trial.
                                                                                                                                  • [best_metric_name]: Best score so far in this study.
                                                                                                                                  • time_trial: Duration of the trial.
                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                  • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                    For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                    This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                    All durations are in seconds. Possible values include:

                                                                                                                                    • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                    • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                    • [metric]_train: Metric score on the train set.
                                                                                                                                    • [metric]_test: Metric score on the test set.
                                                                                                                                    • time_fit: Duration of the model fitting on the train set.
                                                                                                                                    • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                    • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                    • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                      The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                      "}, {"location": "API/models/omp/#methods", "title": "Methods", "text": "

                                                                                                                                      The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                      bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                      method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                      Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                      Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                                      reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                      method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                      Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                      method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                      This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                      Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                      cols: int, default=2 Number of plots in width.

                                                                                                                                      horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                      vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                      title: str, dict or None, default=None Title for the plot.

                                                                                                                                      • If None, no title is shown.
                                                                                                                                      • If str, text for the title.
                                                                                                                                      • If dict, title configuration.

                                                                                                                                      legend: str, dict or None, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                      • If None: No legend is shown.
                                                                                                                                      • If str: Location where to show the legend.
                                                                                                                                      • If dict: Legend configuration.

                                                                                                                                      figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                      filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                      display: bool, default=True Whether to render the plot.

                                                                                                                                      Yieldsgo.Figure Plot object.

                                                                                                                                      method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                      Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                      • In-training validation scores
                                                                                                                                      • Cached predictions.
                                                                                                                                      • Shap values
                                                                                                                                      • App instance
                                                                                                                                      • Dashboard instance
                                                                                                                                      • Calculated holdout data sets

                                                                                                                                      method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                      Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                      Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                      method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                      ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                      By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                      Note

                                                                                                                                      Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                      filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                      **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                      method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                      This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                      Returnspd.DataFrame Overview of the results.

                                                                                                                                      method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                      Read more in the user guide.

                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                      Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                      method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                      Tip

                                                                                                                                      Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                      Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                      rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                      threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                      • The task is binary or multilabel classification.
                                                                                                                                      • The model has a predict_proba method.
                                                                                                                                      • The metric evaluates predicted probabilities.

                                                                                                                                      For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                      Returnspd.Series Scores of the model.

                                                                                                                                      method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                      The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                      ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                      method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                      The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                      ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                      y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                      method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                      In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                      Warning

                                                                                                                                      Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                      Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                      method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                      Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                      Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                      method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                      Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                      Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                      reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                      method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                      Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                      ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                      • If None: y is ignored.
                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                      Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                      series or dataframe Original target column. Only returned if provided.

                                                                                                                                      method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                      Read more in the user guide.

                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                      Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                      method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                      Read more in the user guide.

                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                      Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                      method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                      Read more in the user guide.

                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                      Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                      method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                      This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                      Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                      stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                      archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                      method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                      method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                      Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                      method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                      Read more in the user guide.

                                                                                                                                      Info

                                                                                                                                      If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                      • If None: X must be a selection of rows in the dataset.
                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                      metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                      Returnsfloat Metric score of X with respect to y.

                                                                                                                                      method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                                                                                                                      The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                      Tip

                                                                                                                                      Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                      Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                      host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                      port: int, default=8000 Port for HTTP server.

                                                                                                                                      method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                      Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                      ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                      • If None: y is ignored.
                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                      Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                      series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                      method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                      Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                      method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                      Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                      "}, {"location": "API/models/pa/", "title": "PassiveAggressive", "text": "

                                                                                                                                      PA needs scaling accept sparse allows validation

                                                                                                                                      The passive-aggressive algorithms are a family of algorithms for large-scale learning. They are similar to the Perceptron in that they do not require a learning rate. However, contrary to the Perceptron, they include a regularization parameter C.

                                                                                                                                      Corresponding estimators are:

                                                                                                                                      • PassiveAggressiveClassifier for classification tasks.
                                                                                                                                      • PassiveAggressiveRegressor for regression tasks.

                                                                                                                                      Read more in sklearn's documentation.

                                                                                                                                      See Also

                                                                                                                                      MultiLayerPerceptron Multi-layer Perceptron.

                                                                                                                                      Perceptron Linear Perceptron classification.

                                                                                                                                      StochasticGradientDescent Stochastic Gradient Descent.

                                                                                                                                      "}, {"location": "API/models/pa/#example", "title": "Example", "text": "
                                                                                                                                      >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"PA\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: PA\nMetric: f1\n\n\nResults for PassiveAggressive:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9965\nTest evaluation --> f1: 0.9504\nTime elapsed: 5.512s\n-------------------------------------------------\nTime: 5.512s\n\n\nFinal results ==================== >>\nTotal time: 5.515s\n-------------------------------------\nPassiveAggressive --> f1: 0.9504\n
                                                                                                                                      "}, {"location": "API/models/pa/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                                                                                                                                      ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)max_iterIntDistribution(high=1500, log=False, low=500, step=50)lossCategoricalDistribution(choices=('hinge', 'squared_hinge'))averageCategoricalDistribution(choices=(True, False))

                                                                                                                                      ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)max_iterIntDistribution(high=1500, log=False, low=500, step=50)lossCategoricalDistribution(choices=('epsilon_insensitive', 'squared_epsilon_insensitive'))averageCategoricalDistribution(choices=(True, False))

                                                                                                                                      "}, {"location": "API/models/pa/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/pa/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                      Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                      Models that used automated feature scaling have the scaler added.

                                                                                                                                      Tip

                                                                                                                                      Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                      mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                      The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                      "}, {"location": "API/models/pa/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                      Attributesname: strName of the model.

                                                                                                                                      Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                      This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                      This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                      This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                      • [param_name]: Parameter value used in this trial.
                                                                                                                                      • estimator: Estimator used in this trial.
                                                                                                                                      • [metric_name]: Metric score of the trial.
                                                                                                                                      • [best_metric_name]: Best score so far in this study.
                                                                                                                                      • time_trial: Duration of the trial.
                                                                                                                                      • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                      • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                        For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here for an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                        This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training.

                                                                                                                                        Only the scores of the main metric are tracked. Included keys are: train and test. This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                        The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                        All durations are in seconds. Possible values include:

                                                                                                                                        • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                        • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                        • [metric]_train: Metric score on the train set.
                                                                                                                                        • [metric]_test: Metric score on the test set.
                                                                                                                                        • time_fit: Duration of the model fitting on the train set.
                                                                                                                                        • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                        • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                        • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                          The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                          "}, {"location": "API/models/pa/#methods", "title": "Methods", "text": "

                                                                                                                                          The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                          bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                          method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                          Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                          Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                                          reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                          method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                          Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                          This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                          Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                          cols: int, default=2 Number of plots in width.

                                                                                                                                          horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                          vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                          title: str, dict or None, default=None Title for the plot.

                                                                                                                                          • If None, no title is shown.
                                                                                                                                          • If str, text for the title.
                                                                                                                                          • If dict, title configuration.

                                                                                                                                          legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                          • If None: No legend is shown.
                                                                                                                                          • If str: Location where to show the legend.
                                                                                                                                          • If dict: Legend configuration.

                                                                                                                                          figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                          filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                          display: bool, default=True Whether to render the plot.

                                                                                                                                          Yieldsgo.Figure Plot object.

                                                                                                                                          method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                          Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                          • In-training validation scores
                                                                                                                                          • Cached predictions.
                                                                                                                                          • Shap values
                                                                                                                                          • App instance
                                                                                                                                          • Dashboard instance
                                                                                                                                          • Calculated holdout data sets

                                                                                                                                          method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                          Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                          Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                          method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                          ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                          By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                          Note

                                                                                                                                          Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                          filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                          **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                          method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                          This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                          Returnspd.DataFrame Overview of the results.

                                                                                                                                          method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                          Read more in the user guide.

                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                          Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                          method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                          Tip

                                                                                                                                          Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                          Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                          rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                          threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                          • The task is binary or multilabel classification.
                                                                                                                                          • The model has a predict_proba method.
                                                                                                                                          • The metric evaluates predicted probabilities.

                                                                                                                                          For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                          Returnspd.Series Scores of the model.

                                                                                                                                          method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                          The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                          ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                          method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                          The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                          ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                          y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                          method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                          In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                          Warning

                                                                                                                                          Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                          Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                          method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                          Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                          Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                          method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                          Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                          Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                          reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                          method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                          Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                          ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                          • If None: y is ignored.
                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                          Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                          series or dataframe Original target column. Only returned if provided.

                                                                                                                                          method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                          Read more in the user guide.

                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                          Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                          method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                          Read more in the user guide.

                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                          Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                          method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                          Read more in the user guide.

                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                          Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                          method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                          This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                          Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                          stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                          archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                          method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                          method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                          Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                          method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                          Read more in the user guide.

                                                                                                                                          Info

                                                                                                                                          If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                          • If None: X must be a selection of rows in the dataset.
                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                          metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                          Returnsfloat Metric score of X with respect to y.

                                                                                                                                          method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                          The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                          Tip

                                                                                                                                          Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                          Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                          host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                          port: int, default=8000 Port for HTTP server.

                                                                                                                                          method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                          Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                          ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                          • If None: y is ignored.
                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                          Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                          series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                          method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                          Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                          method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                          Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                          "}, {"location": "API/models/perc/", "title": "Perceptron", "text": "

                                                                                                                                          Perc needs scaling allows validation

                                                                                                                                          The Perceptron is a simple classification algorithm suitable for large scale learning. By default:

                                                                                                                                          • It does not require a learning rate.
                                                                                                                                          • It is not regularized (penalized).
                                                                                                                                          • It updates its model only on mistakes.

                                                                                                                                          The last characteristic implies that the Perceptron is slightly faster to train than StochasticGradientDescent with the hinge loss and that the resulting models are sparser.

                                                                                                                                          Corresponding estimators are:

                                                                                                                                          • Perceptron for classification tasks.

                                                                                                                                          Read more in sklearn's documentation.

                                                                                                                                          See Also

                                                                                                                                          MultiLayerPerceptron Multi-layer Perceptron.

                                                                                                                                          PassiveAggressive Passive Aggressive.

                                                                                                                                          StochasticGradientDescent Stochastic Gradient Descent.

                                                                                                                                          "}, {"location": "API/models/perc/#example", "title": "Example", "text": "
                                                                                                                                          >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"Perc\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: Perc\nMetric: f1\n\n\nResults for Perceptron:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9577\nTime elapsed: 5.509s\n-------------------------------------------------\nTime: 5.509s\n\n\nFinal results ==================== >>\nTotal time: 5.512s\n-------------------------------------\nPerceptron --> f1: 0.9577\n
                                                                                                                                          "}, {"location": "API/models/perc/#hyperparameters", "title": "Hyperparameters", "text": "

                                                                                                                                          ParameterspenaltyCategoricalDistribution(choices=(None, 'l2', 'l1', 'elasticnet'))alphaFloatDistribution(high=10.0, log=True, low=0.0001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)max_iterIntDistribution(high=1500, log=False, low=500, step=50)eta0FloatDistribution(high=10.0, log=True, low=0.01, step=None)

                                                                                                                                          "}, {"location": "API/models/perc/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/perc/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                          Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                          Models that used automated feature scaling have the scaler added.

                                                                                                                                          Tip

                                                                                                                                          Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                          mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                          The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                          "}, {"location": "API/models/perc/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                          Attributesname: strName of the model.

                                                                                                                                          Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                          This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                          This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                          This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                          • [param_name]: Parameter value used in this trial.
                                                                                                                                          • estimator: Estimator used in this trial.
                                                                                                                                          • [metric_name]: Metric score of the trial.
                                                                                                                                          • [best_metric_name]: Best score so far in this study.
                                                                                                                                          • time_trial: Duration of the trial.
                                                                                                                                          • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                          • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                            For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                            This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training.

                                                                                                                                            Only the scores of the main metric are tracked. Included keys are: train and test. This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                            The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                            All durations are in seconds. Possible values include:

                                                                                                                                            • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                            • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                            • [metric]_train: Metric score on the train set.
                                                                                                                                            • [metric]_test: Metric score on the test set.
                                                                                                                                            • time_fit: Duration of the model fitting on the train set.
                                                                                                                                            • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                            • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                            • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                              The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                              "}, {"location": "API/models/perc/#methods", "title": "Methods", "text": "

                                                                                                                                              The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                              bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                              method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                              Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                              Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                                              reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                              method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                              Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's CalibratedClassifierCV. Using cv="prefit" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                              method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                              This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                              Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                              cols: int, default=2 Number of plots in width.

                                                                                                                                              horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                              vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                              title: str, dict or None, default=None Title for the plot.

                                                                                                                                              • If None, no title is shown.
                                                                                                                                              • If str, text for the title.
                                                                                                                                              • If dict, title configuration.

                                                                                                                                              legend: str, dict or None, default="out" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                              • If None: No legend is shown.
                                                                                                                                              • If str: Location where to show the legend.
                                                                                                                                              • If dict: Legend configuration.

                                                                                                                                              figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                              filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                              display: bool, default=True Whether to render the plot.

                                                                                                                                              Yieldsgo.Figure Plot object.

                                                                                                                                              method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                              Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                              • In-training validation scores
                                                                                                                                              • Cached predictions.
                                                                                                                                              • Shap values
                                                                                                                                              • App instance
                                                                                                                                              • Dashboard instance
                                                                                                                                              • Calculated holdout data sets

                                                                                                                                              method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                              Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                              Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                              method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                              ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                              By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                              Note

                                                                                                                                              Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                              filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                              **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                              method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                              This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                              Returnspd.DataFrame Overview of the results.

                                                                                                                                              method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                              Read more in the user guide.

                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                              Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                              method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                              Tip

                                                                                                                                              Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                              Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                              rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                              threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                              • The task is binary or multilabel classification.
                                                                                                                                              • The model has a predict_proba method.
                                                                                                                                              • The metric evaluates predicted probabilities.

                                                                                                                                              For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                              Returnspd.Series Scores of the model.

                                                                                                                                              method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                              The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                              ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                              method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                              The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                              ParametersX: dataframe or None, default=None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                              y: series, dataframe or None, default=None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                              method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                              In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                              Warning

                                                                                                                                              Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                              Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                              method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                              Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe, default="train" Selection of rows on which to calculate the threshold.

                                                                                                                                              Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                              method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                              Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                              Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                              reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                              method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                              Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                              ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                              • If None: y is ignored.
                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                              Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                              series or dataframe Original target column. Only returned if provided.

                                                                                                                                              method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                              Read more in the user guide.

                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                              Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                              method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                              Read more in the user guide.

                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                              Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                              method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                              Read more in the user guide.

                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                              Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                              method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                              This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                              Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                              stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                              archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                              method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                              method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                              Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                              method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                              Read more in the user guide.

                                                                                                                                              Info

                                                                                                                                              If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                              • If None: X must be a selection of rows in the dataset.
                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                              metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                              Returnsfloat Metric score of X with respect to y.

                                                                                                                                              method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                                                                                                                              The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                              Tip

                                                                                                                                              Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                              Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                              host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                              port: int, default=8000 Port for HTTP server.

                                                                                                                                              method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                              Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                              ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                              • If None: y is ignored.
                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                              Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                              series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                              method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                              Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                              method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                              Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                              "}, {"location": "API/models/pt/", "title": "PolynomialTrend", "text": "

                                                                                                                                              PT native multioutput

                                                                                                                                              Forecast time series data with a polynomial trend, using a sklearn LinearRegression class to regress values of time series on index, after extraction of polynomial features.

                                                                                                                                              Corresponding estimators are:

                                                                                                                                              • PolynomialTrendForecaster for forecasting tasks.

                                                                                                                                              See Also

                                                                                                                                              ARIMA Autoregressive Integrated Moving Average Model.

                                                                                                                                              ETS ETS model with automatic fitting capabilities.

                                                                                                                                              NaiveForecaster Naive Forecaster.

                                                                                                                                              "}, {"location": "API/models/pt/#example", "title": "Example", "text": "
                                                                                                                                              >>> from atom import ATOMForecaster\n>>> from sktime.datasets import load_airline\n\n>>> y = load_airline()\n\n>>> atom = ATOMForecaster(y, random_state=1)\n>>> atom.run(models=\"PT\", verbose=2)\n\n\nTraining ========================= >>\nModels: PT\nMetric: mape\n\n\nResults for PolynomialTrend:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.1196\nTest evaluation --> mape: -0.1181\nTime elapsed: 0.018s\n-------------------------------------------------\nTime: 0.018s\n\n\nFinal results ==================== >>\nTotal time: 0.019s\n-------------------------------------\nPolynomialTrend --> mape: -0.1181\n
                                                                                                                                              "}, {"location": "API/models/pt/#hyperparameters", "title": "Hyperparameters", "text": "

                                                                                                                                              ParametersdegreeIntDistribution(high=5, log=False, low=1, step=1)with_interceptCategoricalDistribution(choices=(True, False))

                                                                                                                                              "}, {"location": "API/models/pt/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/pt/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                              Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                              Models that used automated feature scaling have the scaler added.

                                                                                                                                              Tip

                                                                                                                                              Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                              mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                              The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                              "}, {"location": "API/models/pt/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                              Attributesname: strName of the model.

                                                                                                                                              Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                              This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                              This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                              This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                              • [param_name]: Parameter value used in this trial.
                                                                                                                                              • estimator: Estimator used in this trial.
                                                                                                                                              • [metric_name]: Metric score of the trial.
                                                                                                                                              • [best_metric_name]: Best score so far in this study.
                                                                                                                                              • time_trial: Duration of the trial.
                                                                                                                                              • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                              • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                All durations are in seconds. Possible values include:

                                                                                                                                                • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                • [metric]_train: Metric score on the train set.
                                                                                                                                                • [metric]_test: Metric score on the test set.
                                                                                                                                                • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                  The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                  "}, {"location": "API/models/pt/#methods", "title": "Methods", "text": "

                                                                                                                                                  The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                  bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                  method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                  Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                  Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                                                  reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                  method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                  Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                  • If str, text for the title.
                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                  method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                  • In-training validation scores
                                                                                                                                                  • Cached predictions.
                                                                                                                                                  • Shap values
                                                                                                                                                  • App instance
                                                                                                                                                  • Dashboard instance
                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                  method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                  Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                  Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                  method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                  ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                  By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                  Note

                                                                                                                                                  Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                  filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                  **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                  method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                  This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                  Returnspd.DataFrame Overview of the results.

                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                  Tip

                                                                                                                                                      Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                  Returnspd.Series Scores of the model.

                                                                                                                                                  method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                  The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                  method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                  The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                  ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                  y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                  method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                  In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                  Warning

                                                                                                                                                  Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                  Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                  method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                  Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                  Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                  method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                  Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                  Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                  reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                  method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                      Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                  ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                  Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                  series or dataframe Original target column. Only returned if provided.

                                                                                                                                                  method predict(fh, X=None, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                  Read more in the user guide.

                                                                                                                                                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                  Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                                                                                                                  method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]Get prediction intervals on new data or existing rows.

                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_interval method.

                                                                                                                                                  Read more in the user guide.

                                                                                                                                                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                  coverage: float or sequence, default=0.9 Nominal coverage(s) of predictive interval(s).

                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                  Returnsdataframe Predictions with shape=(n_samples, 2) or shape=(n_samples, 2 * n_targets) for multivariate tasks.

                                                                                                                                                  method predict_proba(fh, X=None, marginal=True, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                  Read more in the user guide.

                                                                                                                                                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                  marginal: bool, default=True Whether returned distribution is marginal by time index.

                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                  Returnssktime.proba.Normal Predicted distribution.

                                                                                                                                                      method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source] Get quantile forecasts on new data or existing rows.

                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_quantiles method.

                                                                                                                                                  Read more in the user guide.

                                                                                                                                                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                      alpha: float or list of float, default=[0.05, 0.95] A probability or list of probabilities at which quantile forecasts are computed.

                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                  Returnsdataframe Predictions with shape=(n_samples, len(alpha)) or shape=(n_samples, len(alpha) * n_targets) for multivariate tasks.

                                                                                                                                                  method predict_residuals(y, X=None, verbose=None)[source]Get residuals of forecasts on new data or existing rows.

                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_residuals method.

                                                                                                                                                  Read more in the user guide.

                                                                                                                                                  Parametersy: int, str, dict, sequence or dataframe Ground truth observations.

                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to y.

                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                      Returns series or dataframe Residuals with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                                                                                                                      method predict_var(fh, X=None, cov=False, verbose=None)[source] Get variance forecasts on new data or existing rows.

                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_var method.

                                                                                                                                                  Read more in the user guide.

                                                                                                                                                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                  cov: bool, default=False Whether to compute covariance matrix forecast or marginal variance forecasts.

                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                  Returnsdataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                                                                                                                  method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                  This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                  Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                  stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                  archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                  method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                  method score(y, X=None, fh=None, metric=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                  Read more in the user guide.

                                                                                                                                                  Info

                                                                                                                                                  If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sktime's score method for estimators.

                                                                                                                                                  Parametersy: int, str, dict, sequence or dataframe Ground truth observations.

                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                      fh: int, sequence, ForecastingHorizon or None, default=None The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                  metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                  Returnsfloat Metric score of y with respect to a ground truth.

                                                                                                                                                      method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                                                                                                                                  The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                  Tip

                                                                                                                                                  Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                  Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                  host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                  port: int, default=8000 Port for HTTP server.

                                                                                                                                                  method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                  Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                      ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                  "}, {"location": "API/models/qda/", "title": "QuadraticDiscriminantAnalysis", "text": "

                                                                                                                                                  QDA

                                                                                                                                                      Quadratic Discriminant Analysis is a classifier with a quadratic decision boundary, generated by fitting class conditional densities to the data and using Bayes\u2019 rule. The model fits a Gaussian density to each class, where every class has its own covariance matrix (unlike Linear Discriminant Analysis, which assumes that all classes share the same covariance matrix).

                                                                                                                                                  Corresponding estimators are:

                                                                                                                                                  • QuadraticDiscriminantAnalysis for classification tasks.

                                                                                                                                                  Read more in sklearn's documentation.

                                                                                                                                                  See Also

                                                                                                                                                  LinearDiscriminantAnalysis Linear Discriminant Analysis.

                                                                                                                                                  LogisticRegression Logistic Regression.

                                                                                                                                                  RadiusNearestNeighbors Radius Nearest Neighbors.

                                                                                                                                                  "}, {"location": "API/models/qda/#example", "title": "Example", "text": "
                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"QDA\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: QDA\nMetric: f1\n\n\nResults for QuadraticDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9809\nTest evaluation --> f1: 0.9504\nTime elapsed: 0.023s\n-------------------------------------------------\nTime: 0.023s\n\n\nFinal results ==================== >>\nTotal time: 0.026s\n-------------------------------------\nQuadraticDiscriminantAnalysis --> f1: 0.9504\n
                                                                                                                                                  "}, {"location": "API/models/qda/#hyperparameters", "title": "Hyperparameters", "text": "

                                                                                                                                                  Parametersreg_paramFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)

                                                                                                                                                  "}, {"location": "API/models/qda/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/qda/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                  Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                  Models that used automated feature scaling have the scaler added.

                                                                                                                                                  Tip

                                                                                                                                                  Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                  mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                  The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                  "}, {"location": "API/models/qda/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                  Attributesname: strName of the model.

                                                                                                                                                  Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                      This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                  This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                  This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                  • [param_name]: Parameter value used in this trial.
                                                                                                                                                  • estimator: Estimator used in this trial.
                                                                                                                                                  • [metric_name]: Metric score of the trial.
                                                                                                                                                  • [best_metric_name]: Best score so far in this study.
                                                                                                                                                  • time_trial: Duration of the trial.
                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                  • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                    For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                    This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                    All durations are in seconds. Possible values include:

                                                                                                                                                    • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                    • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                    • [metric]_train: Metric score on the train set.
                                                                                                                                                    • [metric]_test: Metric score on the test set.
                                                                                                                                                    • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                    • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                    • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                    • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                      The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                      "}, {"location": "API/models/qda/#methods", "title": "Methods", "text": "

                                                                                                                                                      The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                      bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                      method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                      Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                          Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                                                      reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                      method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                      Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                      method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                      This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                      Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                      cols: int, default=2 Number of plots in width.

                                                                                                                                                      horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                          vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                      title: str, dict or None, default=None Title for the plot.

                                                                                                                                                      • If None, no title is shown.
                                                                                                                                                      • If str, text for the title.
                                                                                                                                                      • If dict, title configuration.

                                                                                                                                                      legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                      • If None: No legend is shown.
                                                                                                                                                      • If str: Location where to show the legend.
                                                                                                                                                      • If dict: Legend configuration.

                                                                                                                                                      figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                      filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                      display: bool, default=True Whether to render the plot.

                                                                                                                                                      Yieldsgo.Figure Plot object.

                                                                                                                                                      method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                      Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                      • In-training validation scores
                                                                                                                                                      • Cached predictions.
                                                                                                                                                      • Shap values
                                                                                                                                                      • App instance
                                                                                                                                                      • Dashboard instance
                                                                                                                                                      • Calculated holdout data sets

                                                                                                                                                      method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                      Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                      Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                      method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                      ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                      By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                      Note

                                                                                                                                                      Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                      filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                      **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                      method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                      This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                      Returnspd.DataFrame Overview of the results.

                                                                                                                                                      method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                      Read more in the user guide.

                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                      Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                      method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                      Tip

                                                                                                                                                      Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                      Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                      rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                      threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                      • The task is binary or multilabel classification.
                                                                                                                                                      • The model has a predict_proba method.
                                                                                                                                                      • The metric evaluates predicted probabilities.

                                                                                                                                                      For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                      Returnspd.Series Scores of the model.

                                                                                                                                                      method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                      The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                      ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                      method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                      The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                      ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                      y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                      method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                      In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                      Warning

                                                                                                                                                      Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                      Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                      method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                      Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                      Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                      method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                      Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                      Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                      reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                      method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                      Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                      ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                      • If None: y is ignored.
                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                      Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                      series or dataframe Original target column. Only returned if provided.

                                                                                                                                                      method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                      Read more in the user guide.

                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                      Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                      method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                      Read more in the user guide.

                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                      Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                      method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                      Read more in the user guide.

                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                      Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                      method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                      This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                      Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                      stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                      archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                      method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                      method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                      Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                      method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                      Read more in the user guide.

                                                                                                                                                      Info

                                                                                                                                                      If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                      • If None: X must be a selection of rows in the dataset.
                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                      metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                      Returnsfloat Metric score of X with respect to y.

                                                                                                                                                      method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                                                                                                                                      The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                      Tip

                                                                                                                                                      Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                      Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                      host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                      port: int, default=8000 Port for HTTP server.

                                                                                                                                                      method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                      Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                      ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                      • If None: y is ignored.
                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                      Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                      series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                      method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                      Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                      method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                      Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                      "}, {"location": "API/models/rf/", "title": "RandomForest", "text": "

                                                                                                                                                      RF accept sparse native multilabel native multioutput supports acceleration

                                                                                                                                                      Random forests are an ensemble learning method that operates by constructing a multitude of decision trees at training time and outputting the class that is the mode of the classes (classification) or mean prediction (regression) of the individual trees. Random forests correct for decision trees' habit of overfitting to their training set.

                                                                                                                                                      Corresponding estimators are:

                                                                                                                                                      • RandomForestClassifier for classification tasks.
                                                                                                                                                      • RandomForestRegressor for regression tasks.

                                                                                                                                                      Read more in sklearn's documentation.

                                                                                                                                                      Warning

                                                                                                                                                      cuML's implementation of RandomForestClassifier only supports predictions on dtype float32. Convert all dtypes before calling atom's run method to avoid exceptions.

                                                                                                                                                      See Also

                                                                                                                                                      DecisionTree Single Decision Tree.

                                                                                                                                                      ExtraTrees Extremely Randomized Trees.

                                                                                                                                                      HistGradientBoosting Histogram-based Gradient Boosting Machine.

                                                                                                                                                      "}, {"location": "API/models/rf/#example", "title": "Example", "text": "
                                                                                                                                                      >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"RF\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: RF\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9524\nTime elapsed: 0.232s\n-------------------------------------------------\nTime: 0.232s\n\n\nFinal results ==================== >>\nTotal time: 0.236s\n-------------------------------------\nRandomForest --> f1: 0.9524\n
                                                                                                                                                      "}, {"location": "API/models/rf/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression sklearnsklearnexcuml

                                                                                                                                                      Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('gini', 'entropy'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                                                                                                                      cpugpu

                                                                                                                                                      Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('gini', 'entropy'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                                                                                                                      Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('gini', 'entropy'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                                                                                                                      Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('gini', 'entropy'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                                                                                                                      sklearnsklearnexcuml

                                                                                                                                                      Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('squared_error', 'absolute_error', 'poisson'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                                                                                                                      cpugpu

                                                                                                                                                      Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('squared_error', 'absolute_error', 'poisson'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                                                                                                                      Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('squared_error', 'absolute_error', 'poisson'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                                                                                                                      Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('squared_error', 'absolute_error', 'poisson'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                                                                                                                      "}, {"location": "API/models/rf/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/rf/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                      Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                      Models that used automated feature scaling have the scaler added.

                                                                                                                                                      Tip

                                                                                                                                                      Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                      mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                      The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                      "}, {"location": "API/models/rf/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                      Attributesname: strName of the model.

                                                                                                                                                      Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                      This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                      This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                      This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                      • [param_name]: Parameter value used in this trial.
                                                                                                                                                      • estimator: Estimator used in this trial.
                                                                                                                                                      • [metric_name]: Metric score of the trial.
                                                                                                                                                      • [best_metric_name]: Best score so far in this study.
                                                                                                                                                      • time_trial: Duration of the trial.
                                                                                                                                                      • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                      • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                        For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                        This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                        The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                        All durations are in seconds. Possible values include:

                                                                                                                                                        • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                        • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                        • [metric]_train: Metric score on the train set.
                                                                                                                                                        • [metric]_test: Metric score on the test set.
                                                                                                                                                        • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                        • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                        • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                        • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                          The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                          "}, {"location": "API/models/rf/#methods", "title": "Methods", "text": "

                                                                                                                                                          The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                          bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                          method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                          Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                          Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                                                          reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                          method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                          Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                          This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                          Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                          cols: int, default=2 Number of plots in width.

                                                                                                                                                          horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                          vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                          title: str, dict or None, default=None Title for the plot.

                                                                                                                                                          • If None, no title is shown.
                                                                                                                                                          • If str, text for the title.
                                                                                                                                                          • If dict, title configuration.

                                                                                                                                                          legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                          • If None: No legend is shown.
                                                                                                                                                          • If str: Location where to show the legend.
                                                                                                                                                          • If dict: Legend configuration.

                                                                                                                                                          figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                          filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                          display: bool, default=True Whether to render the plot.

                                                                                                                                                          Yieldsgo.Figure Plot object.

                                                                                                                                                          method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                          Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                          • In-training validation scores
                                                                                                                                                          • Cached predictions.
                                                                                                                                                          • Shap values
                                                                                                                                                          • App instance
                                                                                                                                                          • Dashboard instance
                                                                                                                                                          • Calculated holdout data sets

                                                                                                                                                          method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                          Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                          Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                          method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                          ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                          By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                          Note

                                                                                                                                                          Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                          filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                          **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                          method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                          This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                          Returnspd.DataFrame Overview of the results.

                                                                                                                                                          method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                          Read more in the user guide.

                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                          Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                          method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                          Tip

                                                                                                                                                          Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                          Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                          rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                          threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                          • The task is binary or multilabel classification.
                                                                                                                                                          • The model has a predict_proba method.
                                                                                                                                                          • The metric evaluates predicted probabilities.

                                                                                                                                                          For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                          Returnspd.Series Scores of the model.

                                                                                                                                                          method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                          The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                          ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                          method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                          The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                          ParametersX: dataframe or None, default=None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                          y: series, dataframe or None, default=None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                          method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                          In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                          Warning

                                                                                                                                                          Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                          Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                          method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                          Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows on which to calculate the threshold.

                                                                                                                                                          Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                          method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                          Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                          Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                          reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                          method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                          Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                          ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                          • If None: y is ignored.
                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                          Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                          series or dataframe Original target column. Only returned if provided.

                                                                                                                                                          method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                          Read more in the user guide.

                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                          Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                          method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                          Read more in the user guide.

                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                          Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                          method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                          Read more in the user guide.

                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                          Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                          method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                          This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                          Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                          stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                          archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                          method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                          method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                          Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                          method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                          Read more in the user guide.

                                                                                                                                                          Info

                                                                                                                                                          If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                          • If None: X must be a selection of rows in the dataset.
                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                          metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                          Returnsfloat Metric score of X with respect to y.

                                                                                                                                                          method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                                                                                                                                          The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                          Tip

                                                                                                                                                          Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                          Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                          host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                          port: int, default=8000 Port for HTTP server.

                                                                                                                                                          method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                          Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                          ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                          • If None: y is ignored.
                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                          Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                          series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                          method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                          Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                          method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                          Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                          "}, {"location": "API/models/ridge/", "title": "Ridge", "text": "

                                                                                                                                                          Ridge needs scaling accept sparse native multilabel supports acceleration

                                                                                                                                                          If classifier, it first converts the target values into {-1, 1} and then treats the problem as a regression task.

                                                                                                                                                          Corresponding estimators are:

                                                                                                                                                          • RidgeClassifier for classification tasks.
                                                                                                                                                          • Ridge for regression tasks.

                                                                                                                                                          Read more in sklearn's documentation.

                                                                                                                                                          Warning

                                                                                                                                                          Engines sklearnex and cuml are only available for regression tasks.

                                                                                                                                                          See Also

                                                                                                                                                          BayesianRidge Bayesian ridge regression.

                                                                                                                                                          ElasticNet Linear Regression with elasticnet regularization.

                                                                                                                                                          Lasso Linear Regression with lasso regularization.

                                                                                                                                                          "}, {"location": "API/models/ridge/#example", "title": "Example", "text": "
                                                                                                                                                          >>> from atom import ATOMRegressor\n>>> from sklearn.datasets import fetch_california_housing\n\n>>> X, y = fetch_california_housing(return_X_y=True)\n\n>>> atom = ATOMRegressor(X, y, random_state=1)\n>>> atom.run(models=\"Ridge\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= >>\nModels: Ridge\nMetric: r2\n\n\nResults for Ridge:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.6067\nTest evaluation --> r2: 0.6028\nTime elapsed: 0.136s\n-------------------------------------------------\nTime: 0.136s\n\n\nFinal results ==================== >>\nTotal time: 0.137s\n-------------------------------------\nRidge --> r2: 0.6028\n
                                                                                                                                                          "}, {"location": "API/models/ridge/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression sklearnsklearnexcuml

                                                                                                                                                          ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))

                                                                                                                                                          cpugpu

                                                                                                                                                          ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))

                                                                                                                                                          ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))

                                                                                                                                                          ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))

                                                                                                                                                          sklearnsklearnexcuml

                                                                                                                                                          ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))

                                                                                                                                                          cpugpu

                                                                                                                                                          ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))

                                                                                                                                                          ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))

                                                                                                                                                          ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))

                                                                                                                                                          "}, {"location": "API/models/ridge/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/ridge/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                          Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                          Models that used automated feature scaling have the scaler added.

                                                                                                                                                          Tip

                                                                                                                                                          Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                          mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                          The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                          "}, {"location": "API/models/ridge/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                          Attributesname: strName of the model.

                                                                                                                                                          Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                          This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                          This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                          This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                          • [param_name]: Parameter value used in this trial.
                                                                                                                                                          • estimator: Estimator used in this trial.
                                                                                                                                                          • [metric_name]: Metric score of the trial.
                                                                                                                                                          • [best_metric_name]: Best score so far in this study.
                                                                                                                                                          • time_trial: Duration of the trial.
                                                                                                                                                          • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                          • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                            For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                            This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                            The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                            All durations are in seconds. Possible values include:

                                                                                                                                                            • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                            • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                            • [metric]_train: Metric score on the train set.
                                                                                                                                                            • [metric]_test: Metric score on the test set.
                                                                                                                                                            • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                            • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                            • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                            • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                              The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                              "}, {"location": "API/models/ridge/#methods", "title": "Methods", "text": "

                                                                                                                                                              The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                              bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                              method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                              Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                              Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                                                              reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                              method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                              Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                              method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                              This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                              Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                              cols: int, default=2 Number of plots in width.

                                                                                                                                                              horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                              vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                              title: str, dict or None, default=None Title for the plot.

                                                                                                                                                              • If None, no title is shown.
                                                                                                                                                              • If str, text for the title.
                                                                                                                                                              • If dict, title configuration.

                                                                                                                                                              legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                              • If None: No legend is shown.
                                                                                                                                                              • If str: Location where to show the legend.
                                                                                                                                                              • If dict: Legend configuration.

                                                                                                                                                              figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                              filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                              display: bool, default=True Whether to render the plot.

                                                                                                                                                              Yieldsgo.Figure Plot object.

                                                                                                                                                              method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                              Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                              • In-training validation scores
                                                                                                                                                              • Cached predictions.
                                                                                                                                                              • Shap values
                                                                                                                                                              • App instance
                                                                                                                                                              • Dashboard instance
                                                                                                                                                              • Calculated holdout data sets

                                                                                                                                                              method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                              Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                              Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                              method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                              ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                              By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                              Note

                                                                                                                                                              Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                              filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                              **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                              method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                              This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                              Returnspd.DataFrame Overview of the results.

                                                                                                                                                              method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                              Read more in the user guide.

                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                              Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                              method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                              Tip

                                                                                                                                                              Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                              Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                              rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                              threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                              • The task is binary or multilabel classification.
                                                                                                                                                              • The model has a predict_proba method.
                                                                                                                                                              • The metric evaluates predicted probabilities.

                                                                                                                                                              For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                              Returnspd.Series Scores of the model.

                                                                                                                                                              method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                              The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                              ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                              method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                              The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                              ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                              y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                              method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                              In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                              Warning

                                                                                                                                                              Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                              Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                              method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                              Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                              Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                              method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                              Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                              Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                              reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                              method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                              Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                              ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                              • If None: y is ignored.
                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                              Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                              series or dataframe Original target column. Only returned if provided.

                                                                                                                                                              method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                              Read more in the user guide.

                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                              Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                              method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                              Read more in the user guide.

                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                              Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                              method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                              Read more in the user guide.

                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                              Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                              method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                              This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                              Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                              stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                              archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                              method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                              method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                              Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                              method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                              Read more in the user guide.

                                                                                                                                                              Info

                                                                                                                                                              If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                              • If None: X must be a selection of rows in the dataset.
                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                              metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                              Returnsfloat Metric score of X with respect to y.

                                                                                                                                                              method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                                                                                                                                              The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                              Tip

                                                                                                                                                              Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                              Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                              host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                              port: int, default=8000 Port for HTTP server.

                                                                                                                                                              method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                              Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                              ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                              • If None: y is ignored.
                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                              Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                              series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                              method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                              Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                              method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                              Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                              "}, {"location": "API/models/rnn/", "title": "RadiusNearestNeighbors", "text": "

                                                                                                                                                              RNN needs scaling accept sparse native multilabel native multioutput

                                                                                                                                                              Radius Nearest Neighbors implements the nearest neighbors vote, where the neighbors are selected from within a given radius. For regression, the target is predicted by local interpolation of the targets associated with the nearest neighbors in the training set.

                                                                                                                                                              Warning

                                                                                                                                                              • The radius parameter should be tuned to the data at hand or the model will perform poorly.
                                                                                                                                                              • If outliers are detected, the estimator raises an exception unless est_params={\"outlier_label\": \"most_frequent\"} is used.

                                                                                                                                                              Corresponding estimators are:

                                                                                                                                                              • RadiusNeighborsClassifier for classification tasks.
                                                                                                                                                              • RadiusNeighborsRegressor for regression tasks.

                                                                                                                                                              Read more in sklearn's documentation.

                                                                                                                                                              See Also

                                                                                                                                                              KNearestNeighbors K-Nearest Neighbors.

                                                                                                                                                              LinearDiscriminantAnalysis Linear Discriminant Analysis.

                                                                                                                                                              QuadraticDiscriminantAnalysis Quadratic Discriminant Analysis.

                                                                                                                                                              "}, {"location": "API/models/rnn/#example", "title": "Example", "text": "
                                                                                                                                                              >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\n...     models=\"RNN\",\n...     metric=\"f1\",\n...     est_params={\"outlier_label\": \"most_frequent\"},\n...     verbose=2,\n... )\n\n\nTraining ========================= >>\nModels: RNN\nMetric: f1\n\n\nResults for RadiusNearestNeighbors:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.7717\nTime elapsed: 0.091s\n-------------------------------------------------\nTime: 0.091s\n\n\nFinal results ==================== >>\nTotal time: 0.094s\n-------------------------------------\nRadiusNearestNeighbors --> f1: 0.7717 ~\n
                                                                                                                                                              "}, {"location": "API/models/rnn/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                                                                                                                                                              ParametersradiusFloatDistribution(high=100.0, log=False, low=0.01, step=None)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)

                                                                                                                                                              ParametersradiusFloatDistribution(high=100.0, log=False, low=0.01, step=None)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)

                                                                                                                                                              "}, {"location": "API/models/rnn/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/rnn/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                              Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                              Models that used automated feature scaling have the scaler added.

                                                                                                                                                              Tip

                                                                                                                                                              Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                              mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                              The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                              "}, {"location": "API/models/rnn/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                              Attributesname: strName of the model.

                                                                                                                                                              Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                              This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                              This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                              This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                              • [param_name]: Parameter value used in this trial.
                                                                                                                                                              • estimator: Estimator used in this trial.
                                                                                                                                                              • [metric_name]: Metric score of the trial.
                                                                                                                                                              • [best_metric_name]: Best score so far in this study.
                                                                                                                                                              • time_trial: Duration of the trial.
                                                                                                                                                              • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                              • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here for an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                All durations are in seconds. Possible values include:

                                                                                                                                                                • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                • [metric]_train: Metric score on the train set.
                                                                                                                                                                • [metric]_test: Metric score on the test set.
                                                                                                                                                                • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                  The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                  "}, {"location": "API/models/rnn/#methods", "title": "Methods", "text": "

                                                                                                                                                                  The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                  bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                  method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                  Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                  Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                                                                  reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                  method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                  Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                  legend: str, dict or None, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                  • Cached predictions
                                                                                                                                                                  • Shap values
                                                                                                                                                                  • App instance
                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                  method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                  Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                  method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                  ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                  By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                  Note

                                                                                                                                                                  Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                  filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                  **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                  method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                  This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                  Returnspd.DataFrame Overview of the results.

                                                                                                                                                                  method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                  Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                  Tip

                                                                                                                                                                  Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                  Returnspd.Series Scores of the model.

                                                                                                                                                                  method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                  The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                  method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                  The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                  ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                  y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                  method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                  In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                  Warning

                                                                                                                                                                  Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                  Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                  method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                  Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                  Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                  method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                  Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                  Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                  reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                  method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                  Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                  Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                  series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                  method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                  Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                  method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                  Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                  method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                  Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                  method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                  This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                  Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                  stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                  archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                  method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                  method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                  Info

                                                                                                                                                                  If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                  • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                  metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                  Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                  method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                                                                                                                                                  The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                  Tip

                                                                                                                                                                  Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                  Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                  host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                  port: int, default=8000 Port for HTTP server.

                                                                                                                                                                  method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                  Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                  "}, {"location": "API/models/sgd/", "title": "StochasticGradientDescent", "text": "

                                                                                                                                                                  SGD needs scaling accept sparse allows validation

                                                                                                                                                                  Stochastic Gradient Descent is a simple yet very efficient approach to fitting linear classifiers and regressors under convex loss functions. Even though SGD has been around in the machine learning community for a long time, it has received a considerable amount of attention just recently in the context of large-scale learning.

                                                                                                                                                                  Corresponding estimators are:

                                                                                                                                                                  • SGDClassifier for classification tasks.
                                                                                                                                                                  • SGDRegressor for regression tasks.

                                                                                                                                                                  Read more in sklearn's documentation.

                                                                                                                                                                  See Also

                                                                                                                                                                  MultiLayerPerceptron Multi-layer Perceptron.

                                                                                                                                                                  PassiveAggressive Passive Aggressive.

                                                                                                                                                                  SupportVectorMachine Support Vector Machine.

                                                                                                                                                                  "}, {"location": "API/models/sgd/#example", "title": "Example", "text": "
                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"SGD\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: SGD\nMetric: f1\n\n\nResults for StochasticGradientDescent:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9948\nTest evaluation --> f1: 0.9722\nTime elapsed: 5.506s\n-------------------------------------------------\nTime: 5.506s\n\n\nFinal results ==================== >>\nTotal time: 5.509s\n-------------------------------------\nStochasticGradientDescent --> f1: 0.9722\n
                                                                                                                                                                  "}, {"location": "API/models/sgd/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                                                                                                                                                                  ParameterslossCategoricalDistribution(choices=('hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'))penaltyCategoricalDistribution(choices=(None, 'l1', 'l2', 'elasticnet'))alphaFloatDistribution(high=1.0, log=True, low=0.0001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)max_iterIntDistribution(high=1500, log=False, low=500, step=50)epsilonFloatDistribution(high=1.0, log=True, low=0.0001, step=None)learning_rateCategoricalDistribution(choices=('constant', 'invscaling', 'optimal', 'adaptive'))eta0FloatDistribution(high=10.0, log=True, low=0.01, step=None)power_tFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)averageCategoricalDistribution(choices=(True, False))

                                                                                                                                                                  ParameterslossCategoricalDistribution(choices=('squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'))penaltyCategoricalDistribution(choices=(None, 'l1', 'l2', 'elasticnet'))alphaFloatDistribution(high=1.0, log=True, low=0.0001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)max_iterIntDistribution(high=1500, log=False, low=500, step=50)epsilonFloatDistribution(high=1.0, log=True, low=0.0001, step=None)learning_rateCategoricalDistribution(choices=('constant', 'invscaling', 'optimal', 'adaptive'))eta0FloatDistribution(high=10.0, log=True, low=0.01, step=None)power_tFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)averageCategoricalDistribution(choices=(True, False))

                                                                                                                                                                  "}, {"location": "API/models/sgd/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/sgd/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                  Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                  Models that used automated feature scaling have the scaler added.

                                                                                                                                                                  Tip

                                                                                                                                                                  Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                  mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                  The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                  "}, {"location": "API/models/sgd/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                  Attributesname: strName of the model.

                                                                                                                                                                  Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                  This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                  This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                  This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                  • [param_name]: Parameter value used in this trial.
                                                                                                                                                                  • estimator: Estimator used in this trial.
                                                                                                                                                                  • [metric_name]: Metric score of the trial.
                                                                                                                                                                  • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                  • time_trial: Duration of the trial.
                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                  • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                    For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                    This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training.

                                                                                                                                                                    Only the scores of the main metric are tracked. Included keys are: train and test. This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                    All durations are in seconds. Possible values include:

                                                                                                                                                                    • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                    • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                    • [metric]_train: Metric score on the train set.
                                                                                                                                                                    • [metric]_test: Metric score on the test set.
                                                                                                                                                                    • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                    • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                    • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                    • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                      The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                      "}, {"location": "API/models/sgd/#methods", "title": "Methods", "text": "

                                                                                                                                                                      The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                      bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                      method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                      Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                      Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                                                                      reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                      method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                      Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                      method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                      This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                      Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                      cols: int, default=2 Number of plots in width.

                                                                                                                                                                      horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                      vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                      title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                      • If None, no title is shown.
                                                                                                                                                                      • If str, text for the title.
                                                                                                                                                                      • If dict, title configuration.

                                                                                                                                                                      legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                      • If None: No legend is shown.
                                                                                                                                                                      • If str: Location where to show the legend.
                                                                                                                                                                      • If dict: Legend configuration.

                                                                                                                                                                      figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                      filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                      display: bool, default=True Whether to render the plot.

                                                                                                                                                                      Yieldsgo.Figure Plot object.

                                                                                                                                                                      method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                      Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                      • In-training validation scores
                                                                                                                                                                      • Cached predictions.
                                                                                                                                                                      • Shap values
                                                                                                                                                                      • App instance
                                                                                                                                                                      • Dashboard instance
                                                                                                                                                                      • Calculated holdout data sets

                                                                                                                                                                      method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                      Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                      method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                      ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                      By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                      Note

                                                                                                                                                                      Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                      filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                      **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                      method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                      This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                      Returnspd.DataFrame Overview of the results.

                                                                                                                                                                      method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                      Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                      method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                      Tip

                                                                                                                                                                      Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                      Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                      rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                      threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                      • The task is binary or multilabel classification.
                                                                                                                                                                      • The model has a predict_proba method.
                                                                                                                                                                      • The metric evaluates predicted probabilities.

                                                                                                                                                                      For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                      Returnspd.Series Scores of the model.

                                                                                                                                                                      method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                      The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                      ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                      method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                      The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                      ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                      y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                      method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                      In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                      Warning

                                                                                                                                                                      Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                      Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                      method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                      Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                      Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                      method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                      Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                      Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                      reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                      method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                      Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                      ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                      • If None: y is ignored.
                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                      Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                      series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                      method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                      Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                      method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                      Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                      method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                      Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                      method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                      This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                      Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                      stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                      archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                      method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                      method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                      Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                      method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                      Info

                                                                                                                                                                      If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                      • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                      metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                      Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                      method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                                                                                                                                                      The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                      Tip

                                                                                                                                                                      Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                      Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                      host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose the endpoint publicly, set this to \"0.0.0.0\".

                                                                                                                                                                      port: int, default=8000 Port for HTTP server.

                                                                                                                                                                      method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                      Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                      ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                      • If None: y is ignored.
                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                      Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                      series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                      method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                      Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                      method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                      Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                      "}, {"location": "API/models/svm/", "title": "SupportVectorMachine", "text": "

                                                                                                                                                                      SVM needs scaling accept sparse supports acceleration

                                                                                                                                                                      The implementation of the Support Vector Machine is based on libsvm. The fit time scales at least quadratically with the number of samples and may be impractical beyond tens of thousands of samples. For large datasets consider using a LinearSVM or a StochasticGradientDescent model instead.

                                                                                                                                                                      Corresponding estimators are:

                                                                                                                                                                      • SVC for classification tasks.
                                                                                                                                                                      • SVR for regression tasks.

                                                                                                                                                                      Read more in sklearn's documentation.

                                                                                                                                                                      See Also

                                                                                                                                                                      LinearSVM Linear Support Vector Machine.

                                                                                                                                                                      MultiLayerPerceptron Multi-layer Perceptron.

                                                                                                                                                                      StochasticGradientDescent Stochastic Gradient Descent.

                                                                                                                                                                      "}, {"location": "API/models/svm/#example", "title": "Example", "text": "
                                                                                                                                                                      >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"SVM\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: SVM\nMetric: f1\n\n\nResults for SupportVectorMachine:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9913\nTest evaluation --> f1: 0.979\nTime elapsed: 0.095s\n-------------------------------------------------\nTime: 0.095s\n\n\nFinal results ==================== >>\nTotal time: 0.098s\n-------------------------------------\nSupportVectorMachine --> f1: 0.979\n
                                                                                                                                                                      "}, {"location": "API/models/svm/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression sklearnsklearnexcuml

                                                                                                                                                                      ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)shrinkingCategoricalDistribution(choices=(True, False))

                                                                                                                                                                      cpugpu

                                                                                                                                                                      ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)shrinkingCategoricalDistribution(choices=(True, False))

                                                                                                                                                                      ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)shrinkingCategoricalDistribution(choices=(True, False))

                                                                                                                                                                      ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)shrinkingCategoricalDistribution(choices=(True, False))

                                                                                                                                                                      sklearnsklearnexcuml

                                                                                                                                                                      ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)epsilonFloatDistribution(high=100.0, log=True, low=0.001, step=None)shrinkingCategoricalDistribution(choices=(True, False))

                                                                                                                                                                      cpugpu

                                                                                                                                                                      ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)epsilonFloatDistribution(high=100.0, log=True, low=0.001, step=None)shrinkingCategoricalDistribution(choices=(True, False))

                                                                                                                                                                      ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)epsilonFloatDistribution(high=100.0, log=True, low=0.001, step=None)shrinkingCategoricalDistribution(choices=(True, False))

                                                                                                                                                                      ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)epsilonFloatDistribution(high=100.0, log=True, low=0.001, step=None)shrinkingCategoricalDistribution(choices=(True, False))

                                                                                                                                                                      "}, {"location": "API/models/svm/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/svm/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                      Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                      Models that used automated feature scaling have the scaler added.

                                                                                                                                                                      Tip

                                                                                                                                                                      Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                      mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                      The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                      "}, {"location": "API/models/svm/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                      Attributesname: strName of the model.

                                                                                                                                                                      Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                      This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                      This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                      This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                      • [param_name]: Parameter value used in this trial.
                                                                                                                                                                      • estimator: Estimator used in this trial.
                                                                                                                                                                      • [metric_name]: Metric score of the trial.
                                                                                                                                                                      • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                      • time_trial: Duration of the trial.
                                                                                                                                                                      • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                      • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                        For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                        This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                        The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                        All durations are in seconds. Possible values include:

                                                                                                                                                                        • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                        • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                        • [metric]_train: Metric score on the train set.
                                                                                                                                                                        • [metric]_test: Metric score on the test set.
                                                                                                                                                                        • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                        • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                        • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                        • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                          The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                          "}, {"location": "API/models/svm/#methods", "title": "Methods", "text": "

                                                                                                                                                                          The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                          bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                          method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                          Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                          Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                                                                          reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                          method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                          Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                          This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                          Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                          cols: int, default=2 Number of plots in width.

                                                                                                                                                                          horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                          vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                          title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                          • If None, no title is shown.
                                                                                                                                                                          • If str, text for the title.
                                                                                                                                                                          • If dict, title configuration.

                                                                                                                                                                          legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                          • If None: No legend is shown.
                                                                                                                                                                          • If str: Location where to show the legend.
                                                                                                                                                                          • If dict: Legend configuration.

                                                                                                                                                                          figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                          filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                          display: bool, default=True Whether to render the plot.

                                                                                                                                                                          Yieldsgo.Figure Plot object.

                                                                                                                                                                          method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                          Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                          • In-training validation scores
                                                                                                                                                                          • Cached predictions.
                                                                                                                                                                          • Shap values
                                                                                                                                                                          • App instance
                                                                                                                                                                          • Dashboard instance
                                                                                                                                                                          • Calculated holdout data sets

                                                                                                                                                                          method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                          Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                          method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                          ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                          By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                          Note

                                                                                                                                                                          Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                          filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                          **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                          method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                          This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                          Returnspd.DataFrame Overview of the results.

                                                                                                                                                                          method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                          Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                          method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                          Tip

                                                                                                                                                                          Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                          Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                          rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                          threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                          • The task is binary or multilabel classification.
                                                                                                                                                                          • The model has a predict_proba method.
                                                                                                                                                                          • The metric evaluates predicted probabilities.

                                                                                                                                                                          For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                          Returnspd.Series Scores of the model.

                                                                                                                                                                          method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                          The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                          ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                          method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                          The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                          ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                          y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                          method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                          In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                          Warning

                                                                                                                                                                          Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                          Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                          method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                          Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                          Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                          method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                          Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                          Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                          reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                          method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                          Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                          ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                          • If None: y is ignored.
                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                          Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                          series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                          method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                          Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                          method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                          Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                          method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                          Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                          method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                          This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                          Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                          stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                          archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                          method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                          method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                          Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                          method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                          Info

                                                                                                                                                                          If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                          • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                          metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                          Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                          method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                                                                                                                                                          The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                          Tip

                                                                                                                                                                          Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                          Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                          host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                          port: int, default=8000 Port for HTTP server.

                                                                                                                                                                          method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                          Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                          ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                          • If None: y is ignored.
                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                          Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                          series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                          method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                          Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                          method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                          Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                          "}, {"location": "API/models/tree/", "title": "DecisionTree", "text": "

                                                                                                                                                                          Tree accept sparse native multilabel native multioutput

                                                                                                                                                                          A single decision tree classifier/regressor.

                                                                                                                                                                          Corresponding estimators are:

                                                                                                                                                                          • DecisionTreeClassifier for classification tasks.
                                                                                                                                                                          • DecisionTreeRegressor for regression tasks.

                                                                                                                                                                          Read more in sklearn's documentation.

                                                                                                                                                                          See Also

                                                                                                                                                                          ExtraTree Extremely Randomized Tree.

                                                                                                                                                                          ExtraTrees Extremely Randomized Trees.

                                                                                                                                                                          RandomForest Random Forest.

                                                                                                                                                                          "}, {"location": "API/models/tree/#example", "title": "Example", "text": "
                                                                                                                                                                          >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"Tree\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: Tree\nMetric: f1\n\n\nResults for DecisionTree:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9589\nTime elapsed: 0.032s\n-------------------------------------------------\nTime: 0.032s\n\n\nFinal results ==================== >>\nTotal time: 0.035s\n-------------------------------------\nDecisionTree --> f1: 0.9589\n
                                                                                                                                                                          "}, {"location": "API/models/tree/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                                                                                                                                                                          ParameterscriterionCategoricalDistribution(choices=('gini', 'entropy'))splitterCategoricalDistribution(choices=('best', 'random'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                                                                                                                                          ParameterscriterionCategoricalDistribution(choices=('squared_error', 'absolute_error', 'friedman_mse', 'poisson'))splitterCategoricalDistribution(choices=('best', 'random'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                                                                                                                                          "}, {"location": "API/models/tree/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/tree/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                          Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                          Models that used automated feature scaling have the scaler added.

                                                                                                                                                                          Tip

                                                                                                                                                                          Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                          mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                          The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                          "}, {"location": "API/models/tree/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                          Attributesname: strName of the model.

                                                                                                                                                                          Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                          This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                          This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                          This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                          • [param_name]: Parameter value used in this trial.
                                                                                                                                                                          • estimator: Estimator used in this trial.
                                                                                                                                                                          • [metric_name]: Metric score of the trial.
                                                                                                                                                                          • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                          • time_trial: Duration of the trial.
                                                                                                                                                                          • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                          • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                            For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                            This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                            The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                            All durations are in seconds. Possible values include:

                                                                                                                                                                            • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                            • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                            • [metric]_train: Metric score on the train set.
                                                                                                                                                                            • [metric]_test: Metric score on the test set.
                                                                                                                                                                            • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                            • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                            • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                            • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                              The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                              "}, {"location": "API/models/tree/#methods", "title": "Methods", "text": "

                                                                                                                                                                              The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                              bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                              method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                              Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                              Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                                                                              reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                              method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                              Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                              method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                              This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                              Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                              cols: int, default=2 Number of plots in width.

                                                                                                                                                                              horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                              vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                              title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                              • If None, no title is shown.
                                                                                                                                                                              • If str, text for the title.
                                                                                                                                                                              • If dict, title configuration.

                                                                                                                                                                              legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                              • If None: No legend is shown.
                                                                                                                                                                              • If str: Location where to show the legend.
                                                                                                                                                                              • If dict: Legend configuration.

                                                                                                                                                                              figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                              filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                              display: bool, default=True Whether to render the plot.

                                                                                                                                                                              Yieldsgo.Figure Plot object.

                                                                                                                                                                              method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                              Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                              • In-training validation scores
                                                                                                                                                                              • Cached predictions.
                                                                                                                                                                              • Shap values
                                                                                                                                                                              • App instance
                                                                                                                                                                              • Dashboard instance
                                                                                                                                                                              • Calculated holdout data sets

                                                                                                                                                                              method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                              Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                              method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                              ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                              By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                              Note

                                                                                                                                                                              Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                              filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                              **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                              method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                              This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                              Returnspd.DataFrame Overview of the results.

                                                                                                                                                                              method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                              Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                              method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                              Tip

                                                                                                                                                                              Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                              Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                              rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                              threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                              • The task is binary or multilabel classification.
                                                                                                                                                                              • The model has a predict_proba method.
                                                                                                                                                                              • The metric evaluates predicted probabilities.

                                                                                                                                                                              For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                              Returnspd.Series Scores of the model.

                                                                                                                                                                              method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                              The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                              ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                              method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                              The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                              ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                              y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                              method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                              In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                              Warning

                                                                                                                                                                              Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                              Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                              method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                              Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                              Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                              method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                              Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                              Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                              reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                              method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                              Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                              ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                              • If None: y is ignored.
                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                              Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                              series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                              method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                              Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                              method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                              Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                              method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                              Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                              method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                              This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                              Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                              stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                              archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                              method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                              method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                              Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                              method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                              Info

                                                                                                                                                                              If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                              • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                              metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                              Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                              method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.

                                                                                                                                                                              The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                              Tip

                                                                                                                                                                              Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                              Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                              host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                              port: int, default=8000 Port for HTTP server.

                                                                                                                                                                              method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                              Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                              ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                              • If None: y is ignored.
                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                              Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                              series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                              method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                              Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                              method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                              Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                              "}, {"location": "API/models/xgb/", "title": "XGBoost", "text": "

                                                                                                                                                                              XGB needs scaling accept sparse allows validation supports acceleration

                                                                                                                                                                              XGBoost is an optimized distributed gradient boosting model designed to be highly efficient, flexible and portable. XGBoost provides a parallel tree boosting that solve many data science problems in a fast and accurate way.

                                                                                                                                                                              Corresponding estimators are:

                                                                                                                                                                              • XGBClassifier for classification tasks.
                                                                                                                                                                              • XGBRegressor for regression tasks.

                                                                                                                                                                              Read more in XGBoost's documentation.

                                                                                                                                                                              See Also

                                                                                                                                                                              CatBoost Cat Boosting Machine.

                                                                                                                                                                              GradientBoostingMachine Gradient Boosting Machine.

                                                                                                                                                                              LightGBM Light Gradient Boosting Machine.

                                                                                                                                                                              "}, {"location": "API/models/xgb/#example", "title": "Example", "text": "
                                                                                                                                                                              >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"XGB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: XGB\nMetric: f1\n\n\nResults for XGBoost:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9583\nTime elapsed: 0.401s\n-------------------------------------------------\nTime: 0.401s\n\n\nFinal results ==================== >>\nTotal time: 0.404s\n-------------------------------------\nXGBoost --> f1: 0.9583\n
                                                                                                                                                                              "}, {"location": "API/models/xgb/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                                                                                                                                                                              Parametersn_estimatorsIntDistribution(high=500, log=False, low=20, step=10)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_depthIntDistribution(high=20, log=False, low=1, step=1)gammaFloatDistribution(high=1.0, log=False, low=0.0, step=None)min_child_weightIntDistribution(high=10, log=False, low=1, step=1)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)colsample_bytreeFloatDistribution(high=1.0, log=False, low=0.4, step=0.1)reg_alphaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)reg_lambdaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)

                                                                                                                                                                              Parametersn_estimatorsIntDistribution(high=500, log=False, low=20, step=10)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_depthIntDistribution(high=20, log=False, low=1, step=1)gammaFloatDistribution(high=1.0, log=False, low=0.0, step=None)min_child_weightIntDistribution(high=10, log=False, low=1, step=1)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)colsample_bytreeFloatDistribution(high=1.0, log=False, low=0.4, step=0.1)reg_alphaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)reg_lambdaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)

                                                                                                                                                                              "}, {"location": "API/models/xgb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/xgb/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                              Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                              Models that used automated feature scaling have the scaler added.

                                                                                                                                                                              Tip

                                                                                                                                                                              Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                              mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                              The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                              "}, {"location": "API/models/xgb/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                              Attributesname: strName of the model.

                                                                                                                                                                              Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                              This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                              This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                              This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                              • [param_name]: Parameter value used in this trial.
                                                                                                                                                                              • estimator: Estimator used in this trial.
                                                                                                                                                                              • [metric_name]: Metric score of the trial.
                                                                                                                                                                              • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                              • time_trial: Duration of the trial.
                                                                                                                                                                              • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                              • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training.

                                                                                                                                                                                Only the scores of the main metric are tracked. Included keys are: train and test. This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                All durations are in seconds. Possible values include:

                                                                                                                                                                                • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                • [metric]_train: Metric score on the train set.
                                                                                                                                                                                • [metric]_test: Metric score on the test set.
                                                                                                                                                                                • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                  The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                  "}, {"location": "API/models/xgb/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                  bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                  method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                  Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                  Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                                                                                  reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                  method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                  Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                  • Cached predictions
                                                                                                                                                                                  • Shap values
                                                                                                                                                                                  • App instance
                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                  method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                  Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                  method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                  ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                  By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                  Note

                                                                                                                                                                                  Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                  filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                  **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                  method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                  This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                  Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                  method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                  Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                  Returnspd.Series Scores of the model.

                                                                                                                                                                                  method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                  The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                  method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                  The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                  ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                  y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                  method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                  In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                  Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                  method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                  Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                  Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                  method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                  Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                  Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                  reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                  method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                  Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                  method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                  Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                  Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                  method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                  Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                  method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                  This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                  Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                  stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                  archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                  method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                  method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                  Info

                                                                                                                                                                                  If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                  metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                  Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                  method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                  The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                  Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                  host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                  port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                  method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                  "}, {"location": "API/nlp/textcleaner/", "title": "TextCleaner", "text": "

                                                                                                                                                                                  class atom.nlp.TextCleaner(decode=True, lower_case=True, drop_email=True, regex_email=None, drop_url=True, regex_url=None, drop_html=True, regex_html=None, drop_emoji=True, regex_emoji=None, drop_number=True, regex_number=None, drop_punctuation=True, verbose=0, logger=None)[source]Applies standard text cleaning to the corpus.

                                                                                                                                                                                  Transformations include normalizing characters and dropping noise from the text (emails, HTML tags, URLs, etc.). The transformations are applied on the column named corpus, in the same order the parameters are presented. If there is no column with that name, an exception is raised.

                                                                                                                                                                                  This class can be accessed from atom through the textclean method. Read more in the user guide.

                                                                                                                                                                                  Parametersdecode: bool, default=True Whether to decode unicode characters to their ascii representations.

                                                                                                                                                                                  lower_case: bool, default=True Whether to convert all characters to lower case.

                                                                                                                                                                                  drop_email: bool, default=True Whether to drop email addresses from the text.

                                                                                                                                                                                  regex_email: str, default=None Regex used to search for email addresses. If None, it uses r\"[\\w.-]+@[\\w-]+\\.[\\w.-]+\".

                                                                                                                                                                                  drop_url: bool, default=True Whether to drop URL links from the text.

                                                                                                                                                                                  regex_url: str, default=None Regex used to search for URLs. If None, it uses r\"https?://\\S+|www\\.\\S+\".

                                                                                                                                                                                  drop_html: bool, default=True Whether to drop HTML tags from the text. This option is particularly useful if the data was scraped from a website.

                                                                                                                                                                                  regex_html: str, default=None Regex used to search for html tags. If None, it uses r\"<.*?>\".

                                                                                                                                                                                  drop_emoji: bool, default=True Whether to drop emojis from the text.

                                                                                                                                                                                  regex_emoji: str, default=None Regex used to search for emojis. If None, it uses r\":[a-z_]+:\".

                                                                                                                                                                                  drop_number: bool, default=True Whether to drop numbers from the text.

                                                                                                                                                                                  regex_number: str, default=None Regex used to search for numbers. If None, it uses r\"\\b\\d+\\b\". Note that numbers adjacent to letters are not removed.

                                                                                                                                                                                  drop_punctuation: bool, default=True Whether to drop punctuations from the text. Characters considered punctuation are !\"#$%&'()*+,-./:;<=>?@[\\]^_~`.

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic naming.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  TextNormalizer Normalize the corpus.

                                                                                                                                                                                  Tokenizer Tokenize the corpus.

                                                                                                                                                                                  Vectorizer Vectorize text data.

                                                                                                                                                                                  "}, {"location": "API/nlp/textcleaner/#example", "title": "Example", "text": "atomstand-alone
                                                                                                                                                                                  >>> import numpy as np\n>>> from atom import ATOMClassifier\n>>> from sklearn.datasets import fetch_20newsgroups\n\n>>> X, y = fetch_20newsgroups(\n...     return_X_y=True,\n...     categories=[\"alt.atheism\", \"sci.med\", \"comp.windows.x\"],\n...     shuffle=True,\n...     random_state=1,\n... )\n>>> X = np.array(X).reshape(-1, 1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> print(atom.dataset)\n\n                                                 corpus  target\n0     From: fabian@vivian.w.open.de (Fabian Hoppe)\\n...       1\n1     From: nyeda@cnsvax.uwec.edu (David Nye)\\nSubje...       0\n2     From: urathi@net4.ICS.UCI.EDU (Unmesh Rathi)\\n...       1\n3     From: inoue@crd.yokogawa.co.jp (Inoue Takeshi)...       1\n4     From: sandvik@newton.apple.com (Kent Sandvik)\\...       0\n...                                                 ...     ...\n1662  From: kutluk@ccl.umist.ac.uk (Kutluk Ozguven)\\...       0\n1663  From: dmp1@ukc.ac.uk (D.M.Procida)\\nSubject: R...       2\n1664  From: tdunbar@vtaix.cc.vt.edu (Thomas Dunbar)\\...       1\n1665  From: dmp@fig.citib.com (Donna M. Paino)\\nSubj...       2\n1666  From: cdm@pmafire.inel.gov (Dale Cook)\\nSubjec...       2\n\n[1667 rows x 2 columns]\n\n\n>>> atom.textclean(verbose=2)\n\nFitting TextCleaner...\nCleaning the corpus...\n --> Decoding unicode characters to ascii.\n --> Converting text to lower case.\n --> Dropping emails from documents.\n --> Dropping URL links from documents.\n --> Dropping HTML tags from documents.\n --> Dropping emojis from documents.\n --> Dropping numbers from documents.\n --> Dropping punctuation from the text.\n\n\n>>> print(atom.dataset)\n\n                                                 corpus  target\n0     from  fabian hoppe\\nsubject searching cadsoftw...       
1\n1     from  david nye\\nsubject re after  years can w...       0\n2     from  unmesh rathi\\nsubject motif and intervie...       1\n3     from  inoue takeshi\\nsubject how to see charac...       1\n4     from  kent sandvik\\nsubject re slavery was re ...       0\n...                                                 ...     ...\n1662  from  kutluk ozguven\\nsubject re jewish settle...       0\n1663  from  dmprocida\\nsubject re homeopathy a respe...       2\n1664  from  thomas dunbar\\nsubject re x toolkits\\nsu...       1\n1665  from  donna m paino\\nsubject psoriatic arthrit...       2\n1666  from  dale cook\\nsubject re morbus meniere  is...       2\n\n[1667 rows x 2 columns]\n
                                                                                                                                                                                  >>> import numpy as np\n>>> from atom.nlp import TextCleaner\n>>> from sklearn.datasets import fetch_20newsgroups\n\n>>> X, y = fetch_20newsgroups(\n...     return_X_y=True,\n...     categories=[\"alt.atheism\", \"sci.med\", \"comp.windows.x\"],\n...     shuffle=True,\n...     random_state=1,\n... )\n>>> X = np.array(X).reshape(-1, 1)\n\n>>> textcleaner = TextCleaner(verbose=2)\n>>> X = textcleaner.transform(X)\n\nCleaning the corpus...\n --> Decoding unicode characters to ascii.\n --> Converting text to lower case.\n --> Dropping emails from documents.\n --> Dropping URL links from documents.\n --> Dropping HTML tags from documents.\n --> Dropping emojis from documents.\n --> Dropping numbers from documents.\n --> Dropping punctuation from the text.\n\n\n>>> print(X)\n\n                                                 corpus\n0     from  mark a deloura\\nsubject looking for x wi...\n1     from  der mouse\\nsubject re creating  bit wind...\n2     from  keith m ryan\\nsubject re where are they ...\n3     from  steven grimm\\nsubject re opinions on all...\n4     from  peter kaminski\\nsubject re krillean phot...\n...                                                 ...\n1662  from donald mackie \\nsubject re seeking advice...\n1663  from  gordon banks\\nsubject re update help was...\n1664  from  keith m ryan\\nsubject re political athei...\n1665  from  benedikt rosenau\\nsubject re biblical ra...\n1666  from derrick j brashear \\nsubject mouseless op...\n\n[1667 rows x 1 columns]\n
                                                                                                                                                                                  "}, {"location": "API/nlp/textcleaner/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  fitDo nothing.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformApply the transformations to the data.

                                                                                                                                                                                  method fit(X=None, y=None, **fit_params)[source]Do nothing.

                                                                                                                                                                                  Implemented for continuity of the API.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                  Returnsself Estimator instance.

                                                                                                                                                                                  method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                  method inverse_transform(X=None, y=None)[source]Do nothing.

                                                                                                                                                                                  Returns the input unchanged. Implemented for continuity of the API.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  Returnsdataframe Feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Target column. Only returned if provided.

                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                  method transform(X, y=None)[source]Apply the transformations to the data.

                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). If X is not a dataframe, it should be composed of a single feature containing the text documents.

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  Returns: dataframe Transformed corpus.

                                                                                                                                                                                  "}, {"location": "API/nlp/textnormalizer/", "title": "TextNormalizer", "text": "

                                                                                                                                                                                  class atom.nlp.TextNormalizer(stopwords=True, custom_stopwords=None, stem=False, lemmatize=True, verbose=0, logger=None)[source]Normalize the corpus.

                                                                                                                                                                                  Convert words to a more uniform standard. The transformations are applied on the column named corpus, in the same order the parameters are presented. If there is no column with that name, an exception is raised. If the provided documents are strings, words are separated by spaces.

                                                                                                                                                                                  This class can be accessed from atom through the textnormalize method. Read more in the user guide.

                                                                                                                                                                                  Parameters: stopwords: bool or str, default=True Whether to remove a predefined dictionary of stopwords.

                                                                                                                                                                                  • If False: Don't remove any predefined stopwords.
                                                                                                                                                                                  • If True: Drop predefined english stopwords from the text.
                                                                                                                                                                                  • If str: Language from nltk.corpus.stopwords.words.

                                                                                                                                                                                  custom_stopwords: sequence or None, default=None Custom stopwords to remove from the text.

                                                                                                                                                                                  stem: bool or str, default=False Whether to apply stemming using SnowballStemmer.

                                                                                                                                                                                  • If False: Don't apply stemming.
                                                                                                                                                                                  • If True: Apply stemmer based on the english language.
                                                                                                                                                                                  • If str: Language from SnowballStemmer.languages.

                                                                                                                                                                                  lemmatize: bool, default=True Whether to apply lemmatization using WordNetLemmatizer.

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic naming.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  Attributes: feature_names_in_: np.ndarray Names of features seen during fit.

                                                                                                                                                                                  n_features_in_: int Number of features seen during fit.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  TextCleaner Applies standard text cleaning to the corpus.

                                                                                                                                                                                  Tokenizer Tokenize the corpus.

                                                                                                                                                                                  Vectorizer Vectorize text data.

                                                                                                                                                                                  "}, {"location": "API/nlp/textnormalizer/#example", "title": "Example", "text": "atomstand-alone
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n\n>>> X = [\n...    [\"I \u00e0m in ne'w york\"],\n...    [\"New york is nice\"],\n...    [\"new york\"],\n...    [\"hi there this is a test!\"],\n...    [\"another line...\"],\n...    [\"new york is larger than washington\"],\n...    [\"running the test\"],\n...    [\"this is a test\"],\n... ]\n>>> y = [1, 0, 0, 1, 1, 1, 0, 0]\n\n>>> atom = ATOMClassifier(X, y, test_size=2, random_state=1)\n>>> print(atom.dataset)\n\n                               corpus  target\n0                            new york       0\n1                     another line...       1\n2                    New york is nice       0\n3  new york is larger than washington       1\n4                    running the test       0\n5                   I \u00e0m in ne'w york       1\n6                      this is a test       0\n7            hi there this is a test!       1\n\n\n>>> atom.textnormalize(stopwords=\"english\", lemmatize=True, verbose=2)\n\nFitting TextNormalizer...\nNormalizing the corpus...\n --> Dropping stopwords.\n --> Applying lemmatization.\n\n\n>>> print(atom.dataset)\n\n                           corpus  target\n0                     [new, york]       0\n1              [another, line...]       1\n2               [New, york, nice]       0\n3  [new, york, large, washington]       1\n4                     [run, test]       0\n5             [I, \u00e0m, ne'w, york]       1\n6                          [test]       0\n7                     [hi, test!]       1\n
                                                                                                                                                                                  >>> from atom.nlp import TextNormalizer\n\n>>> X = [\n...    [\"I \u00e0m in ne'w york\"],\n...    [\"New york is nice\"],\n...    [\"new york\"],\n...    [\"hi there this is a test!\"],\n...    [\"another line...\"],\n...    [\"new york is larger than washington\"],\n...    [\"running the test\"],\n...    [\"this is a test\"],\n... ]\n\n>>> textnormalizer = TextNormalizer(\n...     stopwords=\"english\",\n...     lemmatize=True,\n...     verbose=2,\n... )\n>>> X = textnormalizer.transform(X)\n\nNormalizing the corpus...\n --> Dropping stopwords.\n --> Applying lemmatization.\n\n\n>>> print(X)\n\n                           corpus\n0             [I, \u00e0m, ne'w, york]\n1               [New, york, nice]\n2                     [new, york]\n3                     [hi, test!]\n4              [another, line...]\n5  [new, york, large, washington]\n6                     [run, test]\n7                          [test]\n
                                                                                                                                                                                  "}, {"location": "API/nlp/textnormalizer/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  fitDo nothing.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformNormalize the text.

                                                                                                                                                                                  method fit(X=None, y=None, **fit_params) [source]: Do nothing.

                                                                                                                                                                                  Implemented for continuity of the API.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                  Returns: self Estimator instance.

                                                                                                                                                                                  method fit_transform(X=None, y=None, **fit_params) [source]: Fit to data, then transform it.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                  Returns: dataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                  method get_params(deep=True) [source]: Get parameters for this estimator.

                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                  Returns: params : dict Parameter names mapped to their values.

                                                                                                                                                                                  method inverse_transform(X=None, y=None) [source]: Do nothing.

                                                                                                                                                                                  Returns the input unchanged. Implemented for continuity of the API.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  Returns: dataframe Feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Target column. Only returned if provided.

                                                                                                                                                                                  method set_params(**params) [source]: Set the parameters of this estimator.

                                                                                                                                                                                  Parameters: **params : dict Estimator parameters.

                                                                                                                                                                                  Returns: self : estimator instance Estimator instance.

                                                                                                                                                                                  method transform(X, y=None) [source]: Normalize the text.

                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). If X is not a dataframe, it should be composed of a single feature containing the text documents.

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  Returns: dataframe Transformed corpus.

                                                                                                                                                                                  "}, {"location": "API/nlp/tokenizer/", "title": "Tokenizer", "text": "

                                                                                                                                                                                  class atom.nlp.Tokenizer(bigram_freq=None, trigram_freq=None, quadgram_freq=None, verbose=0, logger=None)[source]Tokenize the corpus.

                                                                                                                                                                                  Convert documents into sequences of words. Additionally, create n-grams (represented by words united with underscores, e.g., \"New_York\") based on their frequency in the corpus. The transformations are applied on the column named corpus. If there is no column with that name, an exception is raised.

                                                                                                                                                                                  This class can be accessed from atom through the tokenize method. Read more in the user guide.

                                                                                                                                                                                  Parameters: bigram_freq: int, float or None, default=None Frequency threshold for bigram creation.

                                                                                                                                                                                  • If None: Don't create any bigrams.
                                                                                                                                                                                  • If int: Minimum number of occurrences to make a bigram.
                                                                                                                                                                                  • If float: Minimum frequency fraction to make a bigram.

                                                                                                                                                                                  trigram_freq: int, float or None, default=None Frequency threshold for trigram creation.

                                                                                                                                                                                  • If None: Don't create any trigrams.
                                                                                                                                                                                  • If int: Minimum number of occurrences to make a trigram.
                                                                                                                                                                                  • If float: Minimum frequency fraction to make a trigram.

                                                                                                                                                                                  quadgram_freq: int, float or None, default=None Frequency threshold for quadgram creation.

                                                                                                                                                                                  • If None: Don't create any quadgrams.
                                                                                                                                                                                  • If int: Minimum number of occurrences to make a quadgram.
                                                                                                                                                                                  • If float: Minimum frequency fraction to make a quadgram.

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic naming.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  Attributes: bigrams_: pd.DataFrame Created bigrams and their frequencies.

                                                                                                                                                                                  trigrams_: pd.DataFrame Created trigrams and their frequencies.

                                                                                                                                                                                  quadgrams_: pd.DataFrame Created quadgrams and their frequencies.

                                                                                                                                                                                  feature_names_in_: np.ndarray Names of features seen during fit.

                                                                                                                                                                                  n_features_in_: int Number of features seen during fit.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  TextCleaner Applies standard text cleaning to the corpus.

                                                                                                                                                                                  TextNormalizer Normalize the corpus.

                                                                                                                                                                                  Vectorizer Vectorize text data.

                                                                                                                                                                                  "}, {"location": "API/nlp/tokenizer/#example", "title": "Example", "text": "atomstand-alone
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n\n>>> X = [\n...    [\"I \u00e0m in ne'w york\"],\n...    [\"New york is nice\"],\n...    [\"new york\"],\n...    [\"hi there this is a test!\"],\n...    [\"another line...\"],\n...    [\"new york is larger than washington\"],\n...    [\"running the test\"],\n...    [\"this is a test\"],\n... ]\n>>> y = [1, 0, 0, 1, 1, 1, 0, 0]\n\n>>> atom = ATOMClassifier(X, y, test_size=2, random_state=1)\n>>> print(atom.dataset)\n\n                               corpus  target\n0                            new york       0\n1                     another line...       1\n2                    New york is nice       0\n3  new york is larger than washington       1\n4                    running the test       0\n5                   I \u00e0m in ne'w york       1\n6                      this is a test       0\n7            hi there this is a test!       1\n\n\n>>> atom.tokenize(verbose=2)\n\nFitting Tokenizer...\nTokenizing the corpus...\n\n\n>>> print(atom.dataset)\n\n                                      corpus  target\n0                                [new, york]       0\n1                       [another, line, ...]       1\n2                      [New, york, is, nice]       0\n3  [new, york, is, larger, than, washington]       1\n4                       [running, the, test]       0\n5                [I, \u00e0m, in, ne, ', w, york]       1\n6                        [this, is, a, test]       0\n7          [hi, there, this, is, a, test, !]       1\n
                                                                                                                                                                                  >>> from atom.nlp import Tokenizer\n\n>>> X = [\n...    [\"I \u00e0m in ne'w york\"],\n...    [\"New york is nice\"],\n...    [\"new york\"],\n...    [\"hi there this is a test!\"],\n...    [\"another line...\"],\n...    [\"new york is larger than washington\"],\n...    [\"running the test\"],\n...    [\"this is a test\"],\n... ]\n\n>>> tokenizer = Tokenizer(bigram_freq=2, verbose=2)\n>>> X = tokenizer.transform(X)\n\nTokenizing the corpus...\n --> Creating 5 bigrams on 10 locations.\n\n\n>>> print(X)\n\n                                     corpus\n0               [I, \u00e0m, in, ne, ', w, york]\n1                      [New, york_is, nice]\n2                                [new_york]\n3           [hi, there, this_is, a_test, !]\n4                      [another, line, ...]\n5  [new, york_is, larger, than, washington]\n6                      [running, the, test]\n7                         [this_is, a_test]\n
                                                                                                                                                                                  "}, {"location": "API/nlp/tokenizer/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  fitDo nothing.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformTokenize the text.

                                                                                                                                                                                  method fit(X=None, y=None, **fit_params)[source]Do nothing.

                                                                                                                                                                                  Implemented for continuity of the API.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                  Returnsself Estimator instance.

                                                                                                                                                                                  method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                  method inverse_transform(X=None, y=None)[source]Do nothing.

                                                                                                                                                                                  Returns the input unchanged. Implemented for continuity of the API.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  Returnsdataframe Feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Target column. Only returned if provided.

                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                  method transform(X, y=None)[source]Tokenize the text.

                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). If X is not a dataframe, it should be composed of a single feature containing the text documents.

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  Returnsdataframe Transformed corpus.

                                                                                                                                                                                  "}, {"location": "API/nlp/vectorizer/", "title": "Vectorizer", "text": "

                                                                                                                                                                                  class atom.nlp.Vectorizer(strategy=\"bow\", return_sparse=True, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, **kwargs)[source]Vectorize text data.

                                                                                                                                                                                  Transform the corpus into meaningful vectors of numbers. The transformation is applied on the column named corpus. If there is no column with that name, an exception is raised.

                                                                                                                                                                                  If strategy=\"bow\" or \"tfidf\", the transformed columns are named after the word they are embedding with the prefix corpus_. If strategy=\"hashing\", the columns are named hash[N], where N stands for the n-th hashed column.

                                                                                                                                                                                  This class can be accessed from atom through the vectorize method. Read more in the user guide.

                                                                                                                                                                                  Parametersstrategy: str, default=\"bow\" Strategy with which to vectorize the text. Choose from:

                                                                                                                                                                                  • \"bow\": Bag of Words.
                                                                                                                                                                                  • \"tfidf\": Term Frequency - Inverse Document Frequency.
                                                                                                                                                                                  • \"hashing\": Vectorize to a matrix of token occurrences.

                                                                                                                                                                                  return_sparse: bool, default=True Whether to return the transformation output as a dataframe of sparse arrays. Must be False when there are other columns in X (besides corpus) that are non-sparse.

                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic naming.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  **kwargs Additional keyword arguments for the strategy estimator.

                                                                                                                                                                                  Attributes[strategy]_: sklearn transformer Estimator instance (lowercase strategy) used to vectorize the corpus, e.g., vectorizer.tfidf for the tfidf strategy.

                                                                                                                                                                                  feature_names_in_: np.ndarray Names of features seen during fit.

                                                                                                                                                                                  n_features_in_: int Number of features seen during fit.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  TextCleaner Applies standard text cleaning to the corpus.

                                                                                                                                                                                  TextNormalizer Normalize the corpus.

                                                                                                                                                                                  Tokenizer Tokenize the corpus.

                                                                                                                                                                                  "}, {"location": "API/nlp/vectorizer/#example", "title": "Example", "text": "atomstand-alone
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n\n>>> X = [\n...    [\"I \u00e0m in ne'w york\"],\n...    [\"New york is nice\"],\n...    [\"new york\"],\n...    [\"hi there this is a test!\"],\n...    [\"another line...\"],\n...    [\"new york is larger than washington\"],\n...    [\"running the test\"],\n...    [\"this is a test\"],\n... ]\n>>> y = [1, 0, 0, 1, 1, 1, 0, 0]\n\n>>> atom = ATOMClassifier(X, y, test_size=2, random_state=1)\n>>> print(atom.dataset)\n\n                               corpus  target\n0                            new york       0\n1                     another line...       1\n2                    New york is nice       0\n3  new york is larger than washington       1\n4                    running the test       0\n5                   I \u00e0m in ne'w york       1\n6                      this is a test       0\n7            hi there this is a test!       
1\n\n\n>>> atom.vectorize(strategy=\"tfidf\", verbose=2)\n\nFitting Vectorizer...\nVectorizing the corpus...\n\n\n>>> print(atom.dataset)\n\n   corpus_another  corpus_in  corpus_is  corpus_larger  corpus_line  corpus_ne  corpus_new  corpus_nice  corpus_running  corpus_test  corpus_than  corpus_the  corpus_washington  corpus_york  corpus_\u00e0m  target\n0        0.000000   0.000000   0.000000       0.000000     0.000000   0.000000    0.759339     0.000000         0.00000     0.000000     0.000000     0.00000           0.000000     0.650696   0.000000       0\n1        0.707107   0.000000   0.000000       0.000000     0.707107   0.000000    0.000000     0.000000         0.00000     0.000000     0.000000     0.00000           0.000000     0.000000   0.000000       1\n2        0.000000   0.000000   0.518242       0.000000     0.000000   0.000000    0.437535     0.631991         0.00000     0.000000     0.000000     0.00000           0.000000     0.374934   0.000000       0\n3        0.000000   0.000000   0.386401       0.471212     0.000000   0.000000    0.326226     0.000000         0.00000     0.000000     0.471212     0.00000           0.471212     0.279551   0.000000       1\n4        0.000000   0.000000   0.000000       0.000000     0.000000   0.000000    0.000000     0.000000         0.57735     0.577350     0.000000     0.57735           0.000000     0.000000   0.000000       0\n5        0.000000   0.546199   0.000000       0.000000     0.000000   0.546199    0.000000     0.000000         0.00000     0.000000     0.000000     0.00000           0.000000     0.324037   0.546199       1\n6        0.000000   0.000000   0.634086       0.000000     0.000000   0.000000    0.000000     0.000000         0.00000     0.773262     0.000000     0.00000           0.000000     0.000000   0.000000       0\n7        0.000000   0.000000   0.634086       0.000000     0.000000   0.000000    0.000000     0.000000         0.00000     0.773262     0.000000     0.00000           
0.000000     0.000000   0.000000       1\n
                                                                                                                                                                                  >>> from atom.nlp import Vectorizer\n\n>>> X = [\n...    [\"I \u00e0m in ne'w york\"],\n...    [\"New york is nice\"],\n...    [\"new york\"],\n...    [\"hi there this is a test!\"],\n...    [\"another line...\"],\n...    [\"new york is larger than washington\"],\n...    [\"running the test\"],\n...    [\"this is a test\"],\n... ]\n\n>>> vectorizer = Vectorizer(strategy=\"tfidf\", verbose=2)\n>>> X = vectorizer.fit_transform(X)\n\nFitting Vectorizer...\nVectorizing the corpus...\n\n\n>>> print(X)\n\n   corpus_another  corpus_hi  corpus_in  corpus_is  corpus_larger  corpus_line  corpus_ne  corpus_new  corpus_nice  corpus_running  corpus_test  corpus_than  corpus_the  corpus_there  corpus_this  corpus_washington  corpus_york  corpus_\u00e0m\n0        0.000000   0.000000   0.542162   0.000000       0.000000     0.000000   0.542162    0.000000     0.000000        0.000000     0.000000     0.000000    0.000000      0.000000     0.000000           0.000000     0.343774   0.542162\n1        0.000000   0.000000   0.000000   0.415657       0.000000     0.000000   0.000000    0.474072     0.655527        0.000000     0.000000     0.000000    0.000000      0.000000     0.000000           0.000000     0.415657   0.000000\n2        0.000000   0.000000   0.000000   0.000000       0.000000     0.000000   0.000000    0.751913     0.000000        0.000000     0.000000     0.000000    0.000000      0.000000     0.000000           0.000000     0.659262   0.000000\n3        0.000000   0.525049   0.000000   0.332923       0.000000     0.000000   0.000000    0.000000     0.000000        0.000000     0.379712     0.000000    0.000000      0.525049     0.440032           0.000000     0.000000   0.000000\n4        0.707107   0.000000   0.000000   0.000000       0.000000     0.707107   0.000000    0.000000     
0.000000        0.000000     0.000000     0.000000    0.000000      0.000000     0.000000           0.000000     0.000000   0.000000\n5        0.000000   0.000000   0.000000   0.304821       0.480729     0.000000   0.000000    0.347660     0.000000        0.000000     0.000000     0.480729    0.000000      0.000000     0.000000           0.480729     0.304821   0.000000\n6        0.000000   0.000000   0.000000   0.000000       0.000000     0.000000   0.000000    0.000000     0.000000        0.629565     0.455297     0.000000    0.629565      0.000000     0.000000           0.000000     0.000000   0.000000\n7        0.000000   0.000000   0.000000   0.497041       0.000000     0.000000   0.000000    0.000000     0.000000        0.000000     0.566893     0.000000    0.000000      0.000000     0.656949           0.000000     0.000000   0.000000\n
                                                                                                                                                                                  "}, {"location": "API/nlp/vectorizer/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformVectorize the text.

                                                                                                                                                                                  method fit(X, y=None)[source]Fit to data.

                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). If X is not a dataframe, it should be composed of a single feature containing the text documents.

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  ReturnsSelf Estimator instance.

                                                                                                                                                                                  method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                  method inverse_transform(X=None, y=None)[source]Do nothing.

                                                                                                                                                                                  Returns the input unchanged. Implemented for continuity of the API.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  Returnsdataframe Feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Target column. Only returned if provided.

                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                  method transform(X, y=None)[source]Vectorize the text.

                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). If X is not a dataframe, it should be composed of a single feature containing the text documents.

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  Returnsdataframe Transformed corpus.

                                                                                                                                                                                  "}, {"location": "API/pipeline/pipeline/", "title": "Pipeline", "text": "

                                                                                                                                                                                  class atom.pipeline.Pipeline(steps, memory=None, verbose=0)[source]Pipeline of transforms with a final estimator.

                                                                                                                                                                                  Sequentially apply a list of transforms and a final estimator. Intermediate steps of the pipeline must be transformers, that is, they must implement fit and transform methods. The final estimator only needs to implement fit. The transformers in the pipeline can be cached using the memory parameter.

                                                                                                                                                                                  The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters. For this, it enables setting parameters of the various steps using their names and the parameter name separated by __, as in the example below. A step's estimator may be replaced entirely by setting the parameter with its name to another estimator, or a transformer removed by setting it to passthrough or None.

                                                                                                                                                                                  Read more in sklearn's user guide.

                                                                                                                                                                                  Info

                                                                                                                                                                                  This class behaves similarly to sklearn's pipeline, and additionally:

                                                                                                                                                                                  • Works with an empty pipeline.
                                                                                                                                                                                  • Accepts transformers that drop rows.
                                                                                                                                                                                  • Accepts transformers that are only fitted on a subset of the provided dataset.
                                                                                                                                                                                  • Accepts transformers that apply only on the target column.
                                                                                                                                                                                  • Uses transformers that are only applied on the training set to fit the pipeline, not to make predictions on new data.
                                                                                                                                                                                  • The instance is considered fitted at initialization if all the underlying transformers/estimator in the pipeline are.
                                                                                                                                                                                  • It returns attributes from the final estimator if they are not attributes of the Pipeline itself.
                                                                                                                                                                                  • The last transformer is also cached.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  This Pipeline only works with estimators whose parameters for fit, transform, predict, etc... are named X and/or y.

                                                                                                                                                                                  Parameterssteps: list of tuple List of (name, transform) tuples (implementing fit/transform) that are chained in sequential order.

                                                                                                                                                                                  memory: str, Memory or None, default=None Used to cache the fitted transformers of the pipeline. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute named_steps or steps to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time-consuming.

                                                                                                                                                                                  verbose: int or None, default=0 Verbosity level of the transformers in the pipeline. If None, it leaves them to their original verbosity. If >0, the time elapsed while fitting each step is printed.

                                                                                                                                                                                  Attributesnamed_steps: Bunch Dictionary-like object, with the following attributes. Read-only attribute to access any step parameter by user given name. Keys are step names and values are steps parameters.

                                                                                                                                                                                  classes_: np.ndarray of shape (n_classes,) The class' labels. Only exist if the last step of the pipeline is a classifier.

                                                                                                                                                                                  feature_names_in_: np.ndarray Names of features seen during first step fit method.

                                                                                                                                                                                  n_features_in_: int Number of features seen during first step fit method.

                                                                                                                                                                                  "}, {"location": "API/pipeline/pipeline/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> # Initialize atom\n>>> atom = ATOMClassifier(X, y, verbose=2)\n\n<< ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 138.97 kB\nScaled: False\nOutlier values: 165 (1.2%)\n\n\n\n>>> # Apply data cleaning and feature engineering methods\n>>> atom.scale()\n\nFitting Scaler...\nScaling features...\n\n>>> atom.balance(strategy=\"smote\")\n\nOversampling with SMOTE...\n --> Adding 116 samples to class 0.\n\n>>> atom.feature_selection(strategy=\"rfe\", solver=\"lr\", n_features=22)\n\nFitting FeatureSelector...\nPerforming feature selection ...\n --> rfe selected 22 features from the dataset.\n   --> Dropping feature mean texture (rank 2).\n   --> Dropping feature mean smoothness (rank 3).\n   --> Dropping feature mean symmetry (rank 9).\n   --> Dropping feature texture error (rank 7).\n   --> Dropping feature smoothness error (rank 4).\n   --> Dropping feature concavity error (rank 5).\n   --> Dropping feature worst compactness (rank 8).\n   --> Dropping feature worst fractal dimension (rank 6).\n\n\n>>> # Train models\n>>> atom.run(models=\"LR\")\n\n\nTraining ========================= >>\nModels: LR\nMetric: f1\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9913\nTest evaluation --> f1: 0.9787\nTime elapsed: 0.030s\n-------------------------------------------------\nTime: 0.030s\n\n\nFinal results ==================== >>\nTotal time: 
0.033s\n-------------------------------------\nLogisticRegression --> f1: 0.9787\n\n\n>>> # Get the pipeline and make predictions\n>>> pl = atom.lr.export_pipeline()\n>>> print(pl.predict(X))\n\n[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n 1 0 0 1 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 1\n 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1\n 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 1 1 1 0 1 1 0 1 1 1 1 0 1\n 1 1 1 1 1 1 1 1 0 0 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0\n 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1\n 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0\n 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1\n 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 0 0 0 1 1\n 1 1 0 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 0\n 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1\n 1 0 1 1 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 0 1 1\n 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1\n 1 1 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 0 0 1 0 1 0 1 1 1 1 1 0 1 1 0 1 0 1 0 0\n 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n 1 1 1 1 1 1 1 0 0 0 0 0 0 1]\n
                                                                                                                                                                                  "}, {"location": "API/pipeline/pipeline/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  decision_functionTransform, then decision_function of the final estimator.fitFit the pipeline.fit_predictTransform the data, and apply fit_predict with the final estimator.fit_transformFit the pipeline and transform the data.get_feature_names_outGet output feature names for transformation.get_paramsGet parameters for this estimator.inverse_transformInverse transform for each step in a reverse order.predictTransform, then predict of the final estimator.predict_log_probaTransform, then predict_log_proba of the final estimator.predict_probaTransform, then predict_proba of the final estimator.scoreTransform, then score of the final estimator.score_samplesTransform the data, and apply score_samples with the final estimator.set_outputSet the output container when \"transform\" and \"fit_transform\" are called.set_paramsSet the parameters of this estimator.transformTransform the data.

                                                                                                                                                                                  method decision_function(X)[source]Transform, then decision_function of the final estimator.

                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  Returnsnp.ndarray Predicted confidence scores.

                                                                                                                                                                                  method fit(X=None, y=None, **fit_params)[source]Fit the pipeline.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, dict, sequence or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • Else: Array with shape=(n_samples,) to use as target.

                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                  Returnsself Estimator instance.

                                                                                                                                                                                  method fit_predict(X, y=None, **fit_params)[source]Transform the data, and apply fit_predict with the final estimator.

                                                                                                                                                                                  ParametersX : iterable Training data. Must fulfill input requirements of first step of the pipeline.

                                                                                                                                                                                  y : iterable, default=None Training targets. Must fulfill label requirements for all steps of the pipeline.

                                                                                                                                                                                  **fit_params : dict of string -> object Parameters passed to the fit method of each step, where each parameter name is prefixed such that parameter p for step s has key s__p.

                                                                                                                                                                                  Returnsy_pred : ndarray Result of calling fit_predict on the final estimator.

                                                                                                                                                                                  method fit_transform(X=None, y=None, **fit_params)[source]Fit the pipeline and transform the data.

                                                                                                                                                                                  Call fit followed by transform on each transformer in the pipeline. The transformed data are finally passed to the final estimator that calls the transform method. Only valid if the final estimator implements transform. This also works when the final estimator is None, in which case all prior transformations are applied.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. None if the estimator only uses y.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                  method get_feature_names_out(input_features=None)[source]Get output feature names for transformation.

                                                                                                                                                                                  Parametersinput_features : array-like of str or None, default=None Input features.

                                                                                                                                                                                  Returnsfeature_names_out : ndarray of str objects Transformed feature names.

                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                  Returnsparams : mapping of string to any Parameter names mapped to their values.

                                                                                                                                                                                  method inverse_transform(X=None, y=None)[source]Inverse transform for each step in a reverse order.

                                                                                                                                                                                  All estimators in the pipeline must implement the inverse_transform method.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. None if the pipeline only uses y.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                  method predict(X, **predict_params)[source]Transform, then predict of the final estimator.

                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  **predict_params Additional keyword arguments for the predict method. Note that while this may be used to return uncertainties from some models with return_std or return_cov, uncertainties that are generated by the transformations in the pipeline are not propagated to the final estimator.

                                                                                                                                                                                  Returns np.ndarray Predicted classes with shape=(n_samples,).

                                                                                                                                                                                  method predict_log_proba(X)[source] Transform the data, then call predict_log_proba on the final estimator.

                                                                                                                                                                                  Parameters X: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  Returns np.ndarray Predicted class log-probabilities.

                                                                                                                                                                                  method predict_proba(X)[source] Transform the data, then call predict_proba on the final estimator.

                                                                                                                                                                                  Parameters X: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  Returns np.ndarray Predicted class probabilities.

                                                                                                                                                                                  method score(X, y, sample_weight=None)[source] Transform the data, then call score on the final estimator.

                                                                                                                                                                                  Parameters X: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str, dict, sequence

                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • Else: Array with shape=(n_samples,) to use as target.

                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                  Returns float Mean accuracy or r2 of self.predict(X) with respect to y.

                                                                                                                                                                                  method score_samples(X)[source] Transform the data, and apply score_samples with the final estimator.

                                                                                                                                                                                  Parameters X : iterable Data to predict on. Must fulfill input requirements of first step of the pipeline.

                                                                                                                                                                                  Returns y_score : ndarray of shape (n_samples,) Result of calling score_samples on the final estimator.

                                                                                                                                                                                  method set_output(transform=None)[source] Set the output container when \"transform\" and \"fit_transform\" are called.

                                                                                                                                                                                  Parameters transform : {\"default\", \"pandas\"}, default=None Configure output of transform and fit_transform.

                                                                                                                                                                                  • \"default\": Default output format of a transformer
                                                                                                                                                                                  • \"pandas\": DataFrame output
                                                                                                                                                                                  • None: Transform configuration is unchanged

                                                                                                                                                                                  Returns self : estimator instance Estimator instance.

                                                                                                                                                                                  method set_params(**kwargs)[source] Set the parameters of this estimator.

                                                                                                                                                                                  Parameters **kwargs : dict Parameters of this estimator or parameters of estimators contained in steps. Parameters of a step may be set using the step's name and the parameter name separated by a '__'.

                                                                                                                                                                                  Returns self : object Pipeline class instance.

                                                                                                                                                                                  method transform(X=None, y=None, **kwargs)[source] Transform the data.

                                                                                                                                                                                  Call transform on each transformer in the pipeline. The transformed data are finally passed to the final estimator that calls the transform method. Only valid if the final estimator implements transform. This also works when the final estimator is None, in which case all prior transformations are applied.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. None if the pipeline only uses y.

                                                                                                                                                                                  y: int, str, dict, sequence or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • Else: Array with shape=(n_samples,) to use as target.

                                                                                                                                                                                  **kwargs Additional keyword arguments for the _iter inner method.

                                                                                                                                                                                  Returns dataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_calibration/", "title": "plot_calibration", "text": "

                                                                                                                                                                                  method plot_calibration(models=None, rows=\"test\", n_bins=10, target=0, title=None, legend=\"upper left\", figsize=(900, 900), filename=None, display=True)[source]Plot the calibration curve for a binary classifier.

                                                                                                                                                                                  Well-calibrated classifiers are probabilistic classifiers for which the output of the predict_proba method can be directly interpreted as a confidence level. For instance, a calibrated (binary) classifier should classify the samples such that among the samples to which it gave a predict_proba value close to 0.8, approx. 80% actually belong to the positive class. Read more in sklearn's documentation.

                                                                                                                                                                                  This figure shows two plots: the calibration curve, where the x-axis represents the average predicted probability in each bin and the y-axis is the fraction of positives, i.e., the proportion of samples whose class is the positive class (in each bin); and a distribution of all predicted probabilities of the classifier. This plot is available only for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the calibrate method to calibrate the winning model.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                  rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the calibration curve.

                                                                                                                                                                                  • If str: Name of the data set to plot.
                                                                                                                                                                                  • If sequence: Names of the data sets to plot.
                                                                                                                                                                                  • If dict: Names of the sets with corresponding selection of rows as values.

                                                                                                                                                                                  target: int or str, default=0 Target column to look at. Only for multilabel tasks.

                                                                                                                                                                                  n_bins: int, default=10 Number of bins used for calibration. Minimum of 5 required.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"upper left\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple, default=(900, 900) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns go.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_lift Plot the lift curve.

                                                                                                                                                                                  plot_prc Plot the precision-recall curve.

                                                                                                                                                                                  plot_roc Plot the Receiver Operating Characteristics curve.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_calibration/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"RF\", \"LGB\"])\n>>> atom.plot_calibration()\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_components/", "title": "plot_components", "text": "

                                                                                                                                                                                  method plot_components(show=None, title=None, legend=\"lower right\", figsize=None, filename=None, display=True)[source]Plot the explained variance ratio per component.

                                                                                                                                                                                  Kept components are colored and discarded components are transparent. This plot is available only when feature selection was applied with strategy=\"pca\".

                                                                                                                                                                                  Parameters show: int or None, default=None Number of components to show. None to show all.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of components shown.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns go.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_pca Plot the explained variance ratio vs number of components.

                                                                                                                                                                                  plot_rfecv Plot the rfecv results.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_components/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.feature_selection(\"pca\", n_features=5)\n>>> atom.plot_components(show=10)\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_confusion_matrix/", "title": "plot_confusion_matrix", "text": "

                                                                                                                                                                                  method plot_confusion_matrix(models=None, rows=\"test\", target=0, threshold=0.5, title=None, legend=\"upper right\", figsize=None, filename=None, display=True)[source]Plot a model's confusion matrix.

                                                                                                                                                                                  For one model, the plot shows a heatmap. For multiple models, it compares TP, FP, FN and TN in a barplot (not implemented for multiclass classification tasks). This plot is available only for classification tasks.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Fill the threshold parameter with the result from the model's get_best_threshold method to optimize the results.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                  rows: hashable, segment or sequence, default=\"test\" Selection of rows on which to calculate the confusion matrix.

                                                                                                                                                                                  target: int or str, default=0 Target column to look at. Only for multioutput tasks.

                                                                                                                                                                                  threshold: float, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only for binary classification tasks.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"upper right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the plot's type.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns go.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_calibration Plot the calibration curve for a binary classifier.

                                                                                                                                                                                  plot_threshold Plot metric performances against threshold values.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_confusion_matrix/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=100, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, test_size=0.4)\n>>> atom.run([\"LR\", \"RF\"])\n>>> atom.lr.plot_confusion_matrix()  # For one model\n
                                                                                                                                                                                  >>> atom.plot_confusion_matrix()  # For multiple models\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_correlation/", "title": "plot_correlation", "text": "

                                                                                                                                                                                  method plot_correlation(columns=None, method=\"pearson\", title=None, legend=None, figsize=(800, 700), filename=None, display=True)[source]Plot a correlation matrix.

                                                                                                                                                                                  Displays a heatmap showing the correlation between columns in the dataset. The colors red, blue and white stand for positive, negative, and no correlation respectively.

                                                                                                                                                                                  Parameterscolumns: segment, sequence, dataframe or None, default=None Columns to plot. If None, plot all columns in the dataset. Selected categorical columns are ignored.

                                                                                                                                                                                  method: str, default=\"pearson\" Method of correlation. Choose from: pearson, kendall or spearman.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  figsize: tuple, default=(800, 700) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns go.Figure or None: Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_distribution Plot column distributions.

                                                                                                                                                                                  plot_qq Plot a quantile-quantile plot.

                                                                                                                                                                                  plot_relationships Plot pairwise relationships in a dataset.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_correlation/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.plot_correlation()\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_det/", "title": "plot_det", "text": "

                                                                                                                                                                                  method plot_det(models=None, rows=\"test\", target=0, title=None, legend=\"upper right\", figsize=(900, 600), filename=None, display=True)[source]Plot the Detection Error Tradeoff curve.

                                                                                                                                                                                  Read more about DET in sklearn's documentation. Only available for binary classification tasks.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                  rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric.

                                                                                                                                                                                  • If str: Name of the data set to plot.
                                                                                                                                                                                  • If sequence: Names of the data sets to plot.
                                                                                                                                                                                  • If dict: Names of the sets with corresponding selection of rows as values.

                                                                                                                                                                                  target: int or str, default=0 Target column to look at. Only for multilabel tasks.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"upper right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns go.Figure or None: Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_gains Plot the cumulative gains curve.

                                                                                                                                                                                  plot_roc Plot the Receiver Operating Characteristics curve.

                                                                                                                                                                                  plot_prc Plot the precision-recall curve.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_det/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"LR\", \"RF\"])\n>>> atom.plot_det()\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_distribution/", "title": "plot_distribution", "text": "

                                                                                                                                                                                  method plot_distribution(columns=0, distributions=\"kde\", show=None, title=None, legend=\"upper right\", figsize=None, filename=None, display=True)[source]Plot column distributions.

                                                                                                                                                                                  • For numerical columns, plot the probability density distribution. Additionally, it's possible to plot any of scipy.stats distributions fitted to the column.
                                                                                                                                                                                  • For categorical columns, plot the class distribution. Only one categorical column can be plotted at the same time.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use atom's distribution method to check which distribution fits the column best.

                                                                                                                                                                                  Parameterscolumns: int, str, slice or sequence, default=0 Columns to plot. It's only possible to plot one categorical column. If more than one categorical column is selected, all categorical columns are ignored.

                                                                                                                                                                                  distributions: str, sequence or None, default=\"kde\" Distributions to fit. Only for numerical columns.

                                                                                                                                                                                  • If None: No distribution is fit.
                                                                                                                                                                                  • If \"kde\": Fit a Gaussian kde distribution.
                                                                                                                                                                                  • Else: Name of a scipy.stats distribution.

                                                                                                                                                                                  show: int or None, default=None Number of classes (ordered by number of occurrences) to show in the plot. If None, it shows all classes. Only for categorical columns.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None: No title is shown.
                                                                                                                                                                                  • If str: Text for the title.
                                                                                                                                                                                  • If dict: Title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"upper right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the plot's type.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns go.Figure or None: Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_correlation Plot a correlation matrix.

                                                                                                                                                                                  plot_qq Plot a quantile-quantile plot.

                                                                                                                                                                                  plot_relationships Plot pairwise relationships in a dataset.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_distribution/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> import numpy as np\n>>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> # Add a categorical feature\n>>> animals = [\"cat\", \"dog\", \"bird\", \"lion\", \"zebra\"]\n>>> probabilities = [0.001, 0.1, 0.2, 0.3, 0.399]\n>>> X[\"animals\"] = np.random.choice(animals, size=len(X), p=probabilities)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.plot_distribution(columns=[0, 1])\n
                                                                                                                                                                                  >>> atom.plot_distribution(columns=0, distributions=[\"norm\", \"invgauss\"])\n
                                                                                                                                                                                  >>> atom.plot_distribution(columns=\"animals\")\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_edf/", "title": "plot_edf", "text": "

                                                                                                                                                                                  method plot_edf(models=None, metric=None, title=None, legend=\"upper left\", figsize=(900, 600), filename=None, display=True)[source]Plot the Empirical Distribution Function of a study.

                                                                                                                                                                                  Use this plot to analyze and improve hyperparameter search spaces. The EDF assumes that the value of the objective function is in accordance with the uniform distribution over the objective space. This plot is only available for models that ran hyperparameter tuning.

                                                                                                                                                                                  Note

                                                                                                                                                                                  Only complete trials are considered when plotting the EDF.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models that used hyperparameter tuning are selected.

                                                                                                                                                                                  metric: int, str, sequence or None, default=None Metric to plot (only for multi-metric runs). If str, add + between options to select more than one. If None, the metric used to run the pipeline is selected.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"upper left\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns go.Figure or None: Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_hyperparameters Plot hyperparameter relationships in a study.

                                                                                                                                                                                  plot_trials Plot the hyperparameter tuning trials.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_edf/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from optuna.distributions import IntDistribution\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n\n>>> # Run three models with different search spaces\n>>> atom.run(\n...     models=\"RF_1\",\n...     n_trials=20,\n...     ht_params={\"distributions\": {\"n_estimators\": IntDistribution(6, 10)}},\n... )\n>>> atom.run(\n...     models=\"RF_2\",\n...     n_trials=20,\n...     ht_params={\"distributions\": {\"n_estimators\": IntDistribution(11, 15)}},\n... )\n>>> atom.run(\n...     models=\"RF_3\",\n...     n_trials=20,\n...     ht_params={\"distributions\": {\"n_estimators\": IntDistribution(16, 20)}},\n... )\n\n>>> atom.plot_edf()\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_errors/", "title": "plot_errors", "text": "

                                                                                                                                                                                  method plot_errors(models=None, rows=\"test\", target=0, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot a model's prediction errors.

                                                                                                                                                                                  Plot the actual targets from a set against the predicted values generated by the regressor. A linear fit is made on the data. The gray, dashed line shows the identity line. This plot can be useful to detect noise or heteroscedasticity along a range of the target domain. This plot is available only for regression tasks.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                  rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric.

                                                                                                                                                                                  • If str: Name of the data set to plot.
                                                                                                                                                                                  • If sequence: Names of the data sets to plot.
                                                                                                                                                                                  • If dict: Names of the sets with corresponding selection of rows as values.

                                                                                                                                                                                  target: int or str, default=0 Target column to look at. Only for multioutput tasks.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_residuals Plot a model's residuals.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_errors/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMRegressor\n>>> from sklearn.datasets import load_diabetes\n\n>>> X, y = load_diabetes(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMRegressor(X, y)\n>>> atom.run([\"OLS\", \"LGB\"])\n>>> atom.plot_errors()\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_evals/", "title": "plot_evals", "text": "

                                                                                                                                                                                  method plot_evals(models=None, dataset=\"test\", title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot evaluation curves.

                                                                                                                                                                                  The evaluation curves are the main metric scores achieved by the models at every iteration of the training process. This plot is available only for models that allow in-training validation.

                                                                                                                                                                                  Parameters

                                                                                                                                                                                  models: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                  dataset: str, default=\"test\" Data set for which to plot the evaluation curves. Use + between options to select more than one. Choose from: \"train\", \"test\".

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_trials Plot the hyperparameter tuning trials.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_evals/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"XGB\", \"LGB\"])\n>>> atom.plot_evals()\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_feature_importance/", "title": "plot_feature_importance", "text": "

                                                                                                                                                                                  method plot_feature_importance(models=None, show=None, title=None, legend=\"lower right\", figsize=None, filename=None, display=True)[source]Plot a model's feature importance.

                                                                                                                                                                                  The sum of importances for all features (per model) is 1. This plot is available only for models whose estimator has a scores_, feature_importances_ or coef attribute.

                                                                                                                                                                                  Parameters

                                                                                                                                                                                  models: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                  show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_parshap Plot the partial correlation of shap values.

                                                                                                                                                                                  plot_partial_dependence Plot the partial dependence of features.

                                                                                                                                                                                  plot_permutation_importance Plot the feature permutation importance of models.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_feature_importance/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"LR\", \"RF\"])\n>>> atom.plot_feature_importance(show=10)\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_forecast/", "title": "plot_forecast", "text": "

                                                                                                                                                                                  method plot_forecast(models=None, fh=\"test\", X=None, target=0, plot_interval=True, title=None, legend=\"upper left\", figsize=(900, 600), filename=None, display=True)[source]Plot a time series with model forecasts.

                                                                                                                                                                                  This plot is only available for forecasting tasks.

                                                                                                                                                                                  Parameters

                                                                                                                                                                                  models: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. If no models are selected, only the target column is plotted.

                                                                                                                                                                                  fh: hashable, segment, sequence or ForecastingHorizon, default=\"test\" Forecast horizon for which to plot the predictions.

                                                                                                                                                                                  X: dataframe-like or None, default=None Exogenous time series corresponding to fh. This parameter is ignored if fh is a data set.

                                                                                                                                                                                  target: int or str, default=0 Target column to look at. Only for multivariate tasks.

                                                                                                                                                                                  plot_interval: bool, default=True Whether to plot prediction intervals instead of the exact prediction values. If True, the plotted estimators should have a predict_interval method.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"upper left\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_lift Plot the lift curve.

                                                                                                                                                                                  plot_prc Plot the precision-recall curve.

                                                                                                                                                                                  plot_roc Plot the Receiver Operating Characteristics curve.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_forecast/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMForecaster\n>>> from sktime.datasets import load_airline\n\n>>> y = load_airline()\n\n>>> atom = ATOMForecaster(y, random_state=1)\n>>> atom.plot_forecast()\n
                                                                                                                                                                                  >>> atom.run(\n...     models=\"arima\",\n...     est_params={\"order\": (1, 1, 0), \"seasonal_order\": (0, 1, 0, 12)},\n... )\n>>> atom.plot_forecast()\n
                                                                                                                                                                                  >>> atom.plot_forecast(fh=\"train+test\", plot_interval=False)\n
                                                                                                                                                                                  >>> # Forecast the next 4 years starting from the test set\n>>> atom.plot_forecast(fh=range(1, 48))\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_gains/", "title": "plot_gains", "text": "

                                                                                                                                                                                  method plot_gains(models=None, rows=\"test\", target=0, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot the cumulative gains curve.

                                                                                                                                                                                  This plot is available only for binary and multilabel classification tasks.

                                                                                                                                                                                  Parameters

                                                                                                                                                                                  models: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                  rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric.

                                                                                                                                                                                  • If str: Name of the data set to plot.
                                                                                                                                                                                  • If sequence: Names of the data sets to plot.
                                                                                                                                                                                  • If dict: Names of the sets with corresponding selection of rows as values.

                                                                                                                                                                                  target: int or str, default=0 Target column to look at. Only for multilabel tasks.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_det Plot the Detection Error Tradeoff curve.

                                                                                                                                                                                  plot_lift Plot the lift curve.

                                                                                                                                                                                  plot_roc Plot the Receiver Operating Characteristics curve.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_gains/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"LR\", \"RF\"])\n>>> atom.plot_gains()\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_hyperparameter_importance/", "title": "plot_hyperparameter_importance", "text": "

                                                                                                                                                                                  method plot_hyperparameter_importance(models=None, metric=0, show=None, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot a model's hyperparameter importance.

                                                                                                                                                                                  The hyperparameter importances are calculated using the fANOVA importance evaluator. The sum of all importances for all parameters (per model) is 1. This plot is only available for models that ran hyperparameter tuning.

                                                                                                                                                                                  Parameters

                                                                                                                                                                                  models: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models that used hyperparameter tuning are selected.

                                                                                                                                                                                  metric: int or str, default=0 Metric to plot (only for multi-metric runs).

                                                                                                                                                                                  show: int or None, default=None Number of hyperparameters (ordered by importance) to show. None to show all.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=None Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of hyperparameters shown.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_feature_importance Plot a model's feature importance.

                                                                                                                                                                                  plot_hyperparameters Plot hyperparameter relationships in a study.

                                                                                                                                                                                  plot_trials Plot the hyperparameter tuning trials.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_hyperparameter_importance/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"ET\", \"RF\"], n_trials=10)\n>>> atom.plot_hyperparameter_importance()\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_hyperparameters/", "title": "plot_hyperparameters", "text": "

                                                                                                                                                                                  method plot_hyperparameters(models=None, params=(0, 1), metric=0, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot hyperparameter relationships in a study.

                                                                                                                                                                                  A model's hyperparameters are plotted against each other. The corresponding metric scores are displayed in a contour plot. The markers are the trials in the study. This plot is only available for models that ran hyperparameter tuning.

                                                                                                                                                                                  Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., atom.lr.plot_hyperparameters().

                                                                                                                                                                                  params: str, segment or sequence, default=(0, 1) Hyperparameters to plot. Use a sequence or add + between options to select more than one.

                                                                                                                                                                                  metric: int or str, default=0 Metric to plot (only for multi-metric runs).

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=None Does nothing. Implemented for continuity of the API.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of hyperparameters shown.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_hyperparameter_importance Plot a model's hyperparameter importance.

                                                                                                                                                                                  plot_parallel_coordinate Plot high-dimensional parameter relationships in a study.

                                                                                                                                                                                  plot_trials Plot the hyperparameter tuning trials.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_hyperparameters/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\"LR\", n_trials=15)\n>>> atom.plot_hyperparameters(params=(0, 1, 2))\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_learning_curve/", "title": "plot_learning_curve", "text": "

                                                                                                                                                                                  method plot_learning_curve(models=None, metric=None, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot the learning curve: score vs number of training samples.

                                                                                                                                                                                  This plot is available only for models fitted using train sizing. Ensembles are ignored.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                  metric: int, str, sequence or None, default=None Metric to plot (only for multi-metric runs). Use a sequence or add + between options to select more than one. If None, the metric used to run the pipeline is selected.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_results Plot the model results.

                                                                                                                                                                                  plot_successive_halving Plot scores per iteration of the successive halving.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_learning_curve/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.train_sizing([\"LR\", \"RF\"], n_bootstrap=5)\n>>> atom.plot_learning_curve()\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_lift/", "title": "plot_lift", "text": "

                                                                                                                                                                                  method plot_lift(models=None, rows=\"test\", target=0, title=None, legend=\"upper right\", figsize=(900, 600), filename=None, display=True)[source]Plot the lift curve.

                                                                                                                                                                                  Only available for binary classification tasks.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                  rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric.

                                                                                                                                                                                  • If str: Name of the data set to plot.
                                                                                                                                                                                  • If sequence: Names of the data sets to plot.
                                                                                                                                                                                  • If dict: Names of the sets with corresponding selection of rows as values.

                                                                                                                                                                                  target: int or str, default=0 Target column to look at. Only for multilabel tasks.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"upper right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_det Plot the Detection Error Tradeoff curve.

                                                                                                                                                                                  plot_gains Plot the cumulative gains curve.

                                                                                                                                                                                  plot_prc Plot the precision-recall curve.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_lift/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"LR\", \"RF\"])\n>>> atom.plot_lift()\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_ngrams/", "title": "plot_ngrams", "text": "

                                                                                                                                                                                  method plot_ngrams(ngram=\"bigram\", rows=\"dataset\", show=10, title=None, legend=\"lower right\", figsize=None, filename=None, display=True)[source]Plot n-gram frequencies.

                                                                                                                                                                                  The text for the plot is extracted from the column named corpus. If there is no column with that name, an exception is raised. If the documents are not tokenized, the words are separated by spaces.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use atom's tokenize method to separate the words, creating n-grams based on their frequency in the corpus.

                                                                                                                                                                                  Parametersngram: str or int, default=\"bigram\" Number of contiguous words to search for (size of n-gram). Choose from: word (1), bigram (2), trigram (3), quadgram (4).

                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"dataset\" Selection of rows in the corpus to include in the search.

                                                                                                                                                                                  show: int or None, default=10 Number of n-grams (ordered by number of occurrences) to show in the plot. If none, show all n-grams (up to 200).

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of n-grams shown.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_wordcloud Plot a wordcloud from the corpus.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_ngrams/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> import numpy as np\n>>> from atom import ATOMClassifier\n>>> from sklearn.datasets import fetch_20newsgroups\n\n>>> X, y = fetch_20newsgroups(\n...     return_X_y=True,\n...     categories=[\"alt.atheism\", \"sci.med\", \"comp.windows.x\"],\n...     shuffle=True,\n...     random_state=1,\n... )\n>>> X = np.array(X).reshape(-1, 1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.textclean()\n>>> atom.textnormalize()\n>>> atom.plot_ngrams()\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_parallel_coordinate/", "title": "plot_parallel_coordinate", "text": "

                                                                                                                                                                                  method plot_parallel_coordinate(models=None, params=None, metric=0, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot high-dimensional parameter relationships in a study.

                                                                                                                                                                                  Every line of the plot represents one trial. This plot is only available for models that ran hyperparameter tuning.

                                                                                                                                                                                  Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., atom.lr.plot_parallel_coordinate().

                                                                                                                                                                                  params: str, segment, sequence or None, default=None Hyperparameters to plot. Use a sequence or add + between options to select more than one. If None, all the model's hyperparameters are selected.

                                                                                                                                                                                  metric: int or str, default=0 Metric to plot (only for multi-metric runs).

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of hyperparameters shown.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_edf Plot the Empirical Distribution Function of a study.

                                                                                                                                                                                  plot_hyperparameter_importance Plot a model's hyperparameter importance.

                                                                                                                                                                                  plot_hyperparameters Plot hyperparameter relationships in a study.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_parallel_coordinate/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\"RF\", n_trials=15)\n>>> atom.plot_parallel_coordinate(params=slice(1, 5))\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_pareto_front/", "title": "plot_pareto_front", "text": "

                                                                                                                                                                                  method plot_pareto_front(models=None, metric=None, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot the Pareto front of a study.

                                                                                                                                                                                  Shows the trial scores plotted against each other. The marker's colors indicate the trial number. This plot is only available for models with multi-metric runs and hyperparameter tuning.

                                                                                                                                                                                  Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., atom.lr.plot_pareto_front().

                                                                                                                                                                                  metric: str, sequence or None, default=None Metrics to plot. Use a sequence or add + between options to select more than one. If None, the metrics used to run the pipeline are selected.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of metrics shown.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_edf Plot the Empirical Distribution Function of a study.

                                                                                                                                                                                  plot_slice Plot the parameter relationship in a study.

                                                                                                                                                                                  plot_trials Plot the hyperparameter tuning trials.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_pareto_front/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\n...     models=\"RF\",\n...     metric=[\"f1\", \"accuracy\", \"recall\"],\n...     n_trials=15,\n...  )\n>>> atom.plot_pareto_front()\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_parshap/", "title": "plot_parshap", "text": "

                                                                                                                                                                                  method plot_parshap(models=None, columns=None, target=1, title=None, legend=\"upper left\", figsize=(900, 600), filename=None, display=True)[source]Plot the partial correlation of shap values.

                                                                                                                                                                                  Plots the train and test correlation between the shap value of every feature with its target value, after removing the effect of all other features (partial correlation). This plot is useful to identify the features that are contributing most to overfitting. Features that lie below the bisector (diagonal line) performed worse on the test set than on the training set. If the estimator has a scores_, feature_importances_ or coef_ attribute, its normalized values are shown in a color map.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                  columns: int, str, segment, sequence or None, default=None Features to plot. If None, it plots all features.

                                                                                                                                                                                  target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"upper left\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_feature_importance Plot a model's feature importance.

                                                                                                                                                                                  plot_partial_dependence Plot the partial dependence of features.

                                                                                                                                                                                  plot_permutation_importance Plot the feature permutation importance of models.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_parshap/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"GNB\", \"RF\"])\n>>> atom.rf.plot_parshap(legend=None)\n
                                                                                                                                                                                  >>> atom.plot_parshap(columns=slice(5, 10))\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_partial_dependence/", "title": "plot_partial_dependence", "text": "

                                                                                                                                                                                  method plot_partial_dependence(models=None, columns=(0, 1, 2), kind=\"average\", pair=None, target=1, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot the partial dependence of features.

                                                                                                                                                                                  The partial dependence of a feature (or a set of features) corresponds to the response of the model for each possible value of the feature. The plot can take two forms:

                                                                                                                                                                                  • If pair is None: Single feature partial dependence lines. The deciles of the feature values are shown with tick marks on the bottom.
                                                                                                                                                                                  • If pair is defined: Two-way partial dependence plots are plotted as contour plots (only allowed for a single model).

                                                                                                                                                                                  Read more about partial dependence on sklearn's documentation. This plot is not available for multilabel nor multiclass-multioutput classification tasks.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                  columns: int, str, segment, sequence, dataframe, default=(0, 1, 2) Features to get the partial dependence from.

                                                                                                                                                                                  kind: str or sequence, default=\"average\" Kind of dependence to plot. Use a sequence or add + between options to select more than one. Choose from:

                                                                                                                                                                                  • \"average\": Partial dependence averaged across all samples in the dataset.
                                                                                                                                                                                  • \"individual\": Partial dependence for up to 50 random samples (Individual Conditional Expectation).

                                                                                                                                                                                  This parameter is ignored when plotting feature pairs.

                                                                                                                                                                                  pair: int, str or None, default=None Feature with which to pair the features selected by columns. If specified, the resulting figure displays contour plots. Only allowed when plotting a single model. If None, the plots show the partial dependence of single features.

                                                                                                                                                                                  target: int or str, default=1 Class in the target column to look at (only for multiclass classification tasks).

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_feature_importance Plot a model's feature importance.

                                                                                                                                                                                  plot_parshap Plot the partial correlation of shap values.

                                                                                                                                                                                  plot_permutation_importance Plot the feature permutation importance of models.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_partial_dependence/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"LR\", \"RF\"])\n>>> atom.plot_partial_dependence(kind=\"average+individual\", legend=\"upper left\")\n
                                                                                                                                                                                  >>> atom.rf.plot_partial_dependence(columns=(3, 4), pair=2)\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_pca/", "title": "plot_pca", "text": "

                                                                                                                                                                                  method plot_pca(title=None, legend=None, figsize=(900, 600), filename=None, display=True)[source]Plot the explained variance ratio vs number of components.

                                                                                                                                                                                  If the underlying estimator is PCA (for dense datasets), all possible components are plotted. If the underlying estimator is TruncatedSVD (for sparse datasets), it only shows the selected components. The star marks the number of components selected by the user. This plot is available only when feature selection was applied with strategy=\"pca\".

                                                                                                                                                                                  Parameterstitle: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_components Plot the explained variance ratio per component.

                                                                                                                                                                                  plot_rfecv Plot the rfecv results.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_pca/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.feature_selection(\"pca\", n_features=5)\n>>> atom.plot_pca()\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_permutation_importance/", "title": "plot_permutation_importance", "text": "

                                                                                                                                                                                  method plot_permutation_importance(models=None, show=None, n_repeats=10, title=None, legend=\"lower right\", figsize=None, filename=None, display=True)[source]Plot the feature permutation importance of models.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  This method can be slow. Results are cached to speed up repeated calls.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                  show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features.

                                                                                                                                                                                  n_repeats: int, default=10 Number of times to permute each feature.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns go.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_feature_importance Plot a model's feature importance.

                                                                                                                                                                                  plot_partial_dependence Plot the partial dependence of features.

                                                                                                                                                                                  plot_parshap Plot the partial correlation of shap values.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_permutation_importance/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"LR\", \"RF\"])\n>>> atom.plot_permutation_importance(show=10, n_repeats=7)\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_pipeline/", "title": "plot_pipeline", "text": "

                                                                                                                                                                                  method plot_pipeline(models=None, draw_hyperparameter_tuning=True, color_branches=None, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot a diagram of the pipeline.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  This plot uses the schemdraw package, which is incompatible with plotly. The returned plot is therefore a matplotlib figure.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models for which to draw the pipeline. If None, all pipelines are plotted.

                                                                                                                                                                                  draw_hyperparameter_tuning: bool, default=True Whether to show in the diagram that the models used hyperparameter tuning.

                                                                                                                                                                                  color_branches: bool or None, default=None Whether to draw every branch in a different color. If None, branches are colored when there is more than one.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the pipeline drawn.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as png. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns plt.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_wordcloud Plot a wordcloud from the corpus.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_pipeline/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"GNB\", \"RNN\", \"SGD\", \"MLP\"])\n>>> atom.voting(models=atom.winners[:2])\n>>> atom.plot_pipeline()\n
                                                                                                                                                                                  >>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.scale()\n>>> atom.prune()\n>>> atom.run(\"RF\", n_trials=30)\n\n>>> atom.branch = \"undersample\"\n>>> atom.balance(\"nearmiss\")\n>>> atom.run(\"RF_undersample\")\n\n>>> atom.branch = \"oversample_from_main\"\n>>> atom.balance(\"smote\")\n>>> atom.run(\"RF_oversample\")\n\n>>> atom.plot_pipeline()\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_prc/", "title": "plot_prc", "text": "

                                                                                                                                                                                  method plot_prc(models=None, rows=\"test\", target=0, title=None, legend=\"lower left\", figsize=(900, 600), filename=None, display=True)[source]Plot the precision-recall curve.

                                                                                                                                                                                  Read more about PRC in sklearn's documentation. Only available for binary classification tasks.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                  rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric.

                                                                                                                                                                                  • If str: Name of the data set to plot.
                                                                                                                                                                                  • If sequence: Names of the data sets to plot.
                                                                                                                                                                                  • If dict: Names of the sets with corresponding selection of rows as values.

                                                                                                                                                                                  target: int or str, default=0 Target column to look at. Only for multilabel tasks.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"lower left\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns go.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_det Plot the Detection Error Tradeoff curve.

                                                                                                                                                                                  plot_lift Plot the lift curve.

                                                                                                                                                                                  plot_roc Plot the Receiver Operating Characteristics curve.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_prc/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"LR\", \"RF\"])\n>>> atom.plot_prc()\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_probabilities/", "title": "plot_probabilities", "text": "

                                                                                                                                                                                  method plot_probabilities(models=None, rows=\"test\", target=1, title=None, legend=\"upper right\", figsize=(900, 600), filename=None, display=True)[source]Plot the probability distribution of the target classes.

                                                                                                                                                                                  This plot is available only for models with a predict_proba method in classification tasks.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                  rows: hashable, segment or sequence, default=\"test\" Selection of rows on which to calculate the metric.

                                                                                                                                                                                  target: int, str or tuple, default=1 Probability of being that class in the target column. For multioutput tasks, the value should be a tuple of the form (column, class).

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"upper right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns go.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_confusion_matrix Plot a model's confusion matrix.

                                                                                                                                                                                  plot_results Plot the model results.

                                                                                                                                                                                  plot_threshold Plot metric performances against threshold values.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_probabilities/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"LR\", \"RF\"])\n>>> atom.plot_probabilities()\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_qq/", "title": "plot_qq", "text": "

                                                                                                                                                                                  method plot_qq(columns=0, distributions=\"norm\", title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot a quantile-quantile plot.

                                                                                                                                                                                  Columns are distinguished by color and the distributions are distinguished by marker type. Missing values are ignored.

                                                                                                                                                                                  Parameters columns: int, str, slice or sequence, default=0 Columns to plot. Selected categorical columns are ignored.

                                                                                                                                                                                  distributions: str or sequence, default=\"norm\" Names of the scipy.stats distributions to fit to the columns.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns go.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_correlation Plot a correlation matrix.

                                                                                                                                                                                  plot_distribution Plot column distributions.

                                                                                                                                                                                  plot_relationships Plot pairwise relationships in a dataset.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_qq/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.plot_qq(columns=[5, 6])\n
                                                                                                                                                                                  >>> atom.plot_qq(columns=0, distributions=[\"norm\", \"invgauss\", \"triang\"])\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_relationships/", "title": "plot_relationships", "text": "

                                                                                                                                                                                  method plot_relationships(columns=(0, 1, 2), title=None, legend=None, figsize=(900, 900), filename=None, display=True)[source]Plot pairwise relationships in a dataset.

                                                                                                                                                                                  Creates a grid of axes such that each numerical column appears once on the x-axes and once on the y-axes. The lower triangle contains scatter plots (max 250 random samples), the diagonal contains the column distributions, and the upper triangle contains contour histograms for all samples in the columns.

                                                                                                                                                                                  Parameterscolumns: segment, sequence or dataframe, default=(0, 1, 2) Columns to plot. Selected categorical columns are ignored.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=None Does nothing. Implemented for continuity of the API.

                                                                                                                                                                                  figsize: tuple, default=(900, 900) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns go.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_correlation Plot a correlation matrix.

                                                                                                                                                                                  plot_distribution Plot column distributions.

                                                                                                                                                                                  plot_qq Plot a quantile-quantile plot.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_relationships/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.plot_relationships(columns=[0, 4, 5])\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_residuals/", "title": "plot_residuals", "text": "

                                                                                                                                                                                  method plot_residuals(models=None, rows=\"test\", target=0, title=None, legend=\"upper left\", figsize=(900, 600), filename=None, display=True)[source]Plot a model's residuals.

                                                                                                                                                                                  The plot shows the residuals (difference between the predicted and the true value) on the vertical axis and the independent variable on the horizontal axis. The gray, intersected line shows the identity line. This plot can be useful to analyze the variance of the regressor's errors. If the points are randomly dispersed around the horizontal axis, a linear regression model is appropriate for the data; otherwise, a non-linear model is more appropriate. This plot is only available for regression tasks.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                  rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric.

                                                                                                                                                                                  • If str: Name of the data set to plot.
                                                                                                                                                                                  • If sequence: Names of the data sets to plot.
                                                                                                                                                                                  • If dict: Names of the sets with corresponding selection of rows as values.

                                                                                                                                                                                  target: int or str, default=0 Target column to look at. Only for multioutput tasks.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"upper left\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns go.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_errors Plot a model's prediction errors.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_residuals/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMRegressor\n>>> from sklearn.datasets import load_diabetes\n\n>>> X, y = load_diabetes(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMRegressor(X, y)\n>>> atom.run([\"OLS\", \"LGB\"])\n>>> atom.plot_residuals()\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_results/", "title": "plot_results", "text": "

                                                                                                                                                                                  method plot_results(models=None, metric=None, title=None, legend=\"lower right\", figsize=None, filename=None, display=True)[source]Plot the model results.

                                                                                                                                                                                  If all models applied bootstrap, the plot is a boxplot. If not, the plot is a barplot. Models are ordered based on their score from the top down. The score is either the [metric]_bootstrap or [metric]_test values, selected in that order.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                  metric: int, str, sequence or None, default=None Metric to plot (only for multi-metric runs). Other available options are: \"time_bo\", \"time_fit\", \"time_bootstrap\", \"time\". If str, add + between options to select more than one. If None, the metric used to run the pipeline is selected.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of models.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns go.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_confusion_matrix Plot a model's confusion matrix.

                                                                                                                                                                                  plot_probabilities Plot the probability distribution of the target classes.

                                                                                                                                                                                  plot_threshold Plot metric performances against threshold values.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_results/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"GNB\", \"LR\", \"RF\", \"LGB\"], metric=[\"f1\", \"recall\"])\n>>> atom.plot_results()\n
                                                                                                                                                                                  >>> atom.run([\"GNB\", \"LR\", \"RF\", \"LGB\"], metric=[\"f1\", \"recall\"], n_bootstrap=5)\n>>> atom.plot_results()\n
                                                                                                                                                                                  >>> atom.plot_results(metric=\"time_fit+time\")\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_rfecv/", "title": "plot_rfecv", "text": "

                                                                                                                                                                                  method plot_rfecv(title=None, legend=None, figsize=(900, 600), filename=None, display=True)[source]Plot the rfecv results.

                                                                                                                                                                                  Plot the scores obtained by the estimator fitted on every subset of the dataset. Only available when feature selection was applied with strategy=\"rfecv\".

                                                                                                                                                                                  Parameters title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=None Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns go.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_components Plot the explained variance ratio per component.

                                                                                                                                                                                  plot_pca Plot the explained variance ratio vs number of components.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_rfecv/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.feature_selection(\"rfecv\", solver=\"Tree\")\n>>> atom.plot_rfecv()\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_roc/", "title": "plot_roc", "text": "

                                                                                                                                                                                  method plot_roc(models=None, rows=\"test\", target=0, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot the Receiver Operating Characteristics curve.

                                                                                                                                                                                  Read more about ROC in sklearn's documentation. Only available for classification tasks.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                  rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric.

                                                                                                                                                                                  • If str: Name of the data set to plot.
                                                                                                                                                                                  • If sequence: Names of the data sets to plot.
                                                                                                                                                                                  • If dict: Names of the sets with corresponding selection of rows as values.

                                                                                                                                                                                  target: int or str, default=0 Target column to look at. Only for multilabel tasks.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns go.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_gains Plot the cumulative gains curve.

                                                                                                                                                                                  plot_lift Plot the lift curve.

                                                                                                                                                                                  plot_prc Plot the precision-recall curve.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_roc/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"LR\", \"RF\"])\n>>> atom.plot_roc()\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_bar/", "title": "plot_shap_bar", "text": "

                                                                                                                                                                                  method plot_shap_bar(models=None, rows=\"test\", show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot SHAP's bar plot.

                                                                                                                                                                                  Create a bar plot of a set of SHAP values. If a single sample is passed, then the SHAP values are plotted. If many samples are passed, then the mean absolute value for each feature column is plotted. Read more about SHAP plots in the user guide.

                                                                                                                                                                                  Parameters — models: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., atom.lr.plot_shap_bar().

                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to plot.

                                                                                                                                                                                  show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features.

                                                                                                                                                                                  target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as png. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns — plt.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_parshap Plot the partial correlation of shap values.

                                                                                                                                                                                  plot_shap_beeswarm Plot SHAP's beeswarm plot.

                                                                                                                                                                                  plot_shap_scatter Plot SHAP's scatter plot.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_bar/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\"LR\")\n>>> atom.plot_shap_bar(show=10)\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_beeswarm/", "title": "plot_shap_beeswarm", "text": "

                                                                                                                                                                                  method plot_shap_beeswarm(models=None, rows=\"test\", show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot SHAP's beeswarm plot.

                                                                                                                                                                                  The plot is colored by feature values. Read more about SHAP plots in the user guide.

                                                                                                                                                                                  Parameters — models: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., atom.lr.plot_shap_beeswarm().

                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to plot. The plot_shap_beeswarm method does not support plotting a single sample.

                                                                                                                                                                                  show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features.

                                                                                                                                                                                  target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as png. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns — plt.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_parshap Plot the partial correlation of shap values.

                                                                                                                                                                                  plot_shap_bar Plot SHAP's bar plot.

                                                                                                                                                                                  plot_shap_scatter Plot SHAP's scatter plot.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_beeswarm/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\"LR\")\n>>> atom.plot_shap_beeswarm(show=10)\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_decision/", "title": "plot_shap_decision", "text": "

                                                                                                                                                                                  method plot_shap_decision(models=None, rows=\"test\", show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot SHAP's decision plot.

                                                                                                                                                                                  Visualize model decisions using cumulative SHAP values. Each plotted line explains a single model prediction. If a single prediction is plotted, feature values are printed in the plot (if supplied). If multiple predictions are plotted together, feature values will not be printed. Plotting too many predictions together will make the plot unintelligible. Read more about SHAP plots in the user guide.

                                                                                                                                                                                  Parameters — models: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., atom.lr.plot_shap_decision().

                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to plot.

                                                                                                                                                                                  show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features.

                                                                                                                                                                                  target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as png. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns — plt.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_shap_bar Plot SHAP's bar plot.

                                                                                                                                                                                  plot_shap_beeswarm Plot SHAP's beeswarm plot.

                                                                                                                                                                                  plot_shap_force Plot SHAP's force plot.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_decision/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\"LR\")\n>>> atom.plot_shap_decision(show=10)\n
                                                                                                                                                                                  >>> atom.plot_shap_decision(rows=-1, show=10)\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_force/", "title": "plot_shap_force", "text": "

                                                                                                                                                                                  method plot_shap_force(models=None, rows=\"test\", target=1, title=None, legend=None, figsize=(900, 300), filename=None, display=True, **kwargs)[source]Plot SHAP's force plot.

                                                                                                                                                                                  Visualize the given SHAP values with an additive force layout. Note that by default this plot will render using javascript. For a regular figure use matplotlib=True (this option is only available when only a single sample is plotted). Read more about SHAP plots in the user guide.

                                                                                                                                                                                  Parameters — models: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., atom.lr.plot_shap_force().

                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to plot.

                                                                                                                                                                                  target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  figsize: tuple or None, default=(900, 300) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as png. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure (only if matplotlib=True in kwargs).

                                                                                                                                                                                  **kwargs Additional keyword arguments for shap.plots.force.

                                                                                                                                                                                  Returns — plt.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_shap_beeswarm Plot SHAP's beeswarm plot.

                                                                                                                                                                                  plot_shap_scatter Plot SHAP's scatter plot.

                                                                                                                                                                                  plot_shap_decision Plot SHAP's decision plot.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_force/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\"LR\")\n>>> atom.plot_shap_force(rows=-2, matplotlib=True, figsize=(1800, 300))\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_heatmap/", "title": "plot_shap_heatmap", "text": "

                                                                                                                                                                                  method plot_shap_heatmap(models=None, rows=\"test\", show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot SHAP's heatmap plot.

                                                                                                                                                                                  This plot is designed to show the population substructure of a dataset using supervised clustering and a heatmap. Supervised clustering involves clustering data points not by their original feature values but by their explanations. Read more about SHAP plots in the user guide.

                                                                                                                                                                                  Parameters — models: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., atom.lr.plot_shap_heatmap().

                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to plot. The plot_shap_heatmap method does not support plotting a single sample.

                                                                                                                                                                                  show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features.

                                                                                                                                                                                  target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as png. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns — plt.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_shap_decision Plot SHAP's decision plot.

                                                                                                                                                                                  plot_shap_force Plot SHAP's force plot.

                                                                                                                                                                                  plot_shap_waterfall Plot SHAP's waterfall plot.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_heatmap/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\"LR\")\n>>> atom.plot_shap_heatmap(show=10)\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_scatter/", "title": "plot_shap_scatter", "text": "

                                                                                                                                                                                  method plot_shap_scatter(models=None, rows=\"test\", columns=0, target=1, title=None, legend=None, figsize=(900, 600), filename=None, display=True)[source]Plot SHAP's scatter plot.

                                                                                                                                                                                  Plots the value of the feature on the x-axis and the SHAP value of the same feature on the y-axis. This shows how the model depends on the given feature, and is like a richer extension of the classical partial dependence plots. Vertical dispersion of the data points represents interaction effects. Read more about SHAP plots in the user guide.

                                                                                                                                                                                  Parameters: models: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., atom.lr.plot_shap_scatter().

                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to plot. The plot_shap_scatter method does not support plotting a single sample.

                                                                                                                                                                                  columns: int or str, default=0 Column to plot.

                                                                                                                                                                                  target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  figsize: tuple or None, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as png. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns: plt.Figure or None. Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_shap_beeswarm Plot SHAP's beeswarm plot.

                                                                                                                                                                                  plot_shap_decision Plot SHAP's decision plot.

                                                                                                                                                                                  plot_shap_force Plot SHAP's force plot.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_scatter/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\"LR\")\n>>> atom.plot_shap_scatter(columns=\"symmetry error\")\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_waterfall/", "title": "plot_shap_waterfall", "text": "

                                                                                                                                                                                  method plot_shap_waterfall(models=None, rows=0, show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot SHAP's waterfall plot.

                                                                                                                                                                                  The SHAP value of a feature represents the impact of the evidence provided by that feature on the model\u2019s output. The waterfall plot is designed to visually display how the SHAP values (evidence) of each feature move the model output from our prior expectation under the background data distribution, to the final model prediction given the evidence of all the features. Features are sorted by the magnitude of their SHAP values with the smallest magnitude features grouped together at the bottom of the plot when the number of features in the models exceeds the show parameter. Read more about SHAP plots in the user guide.

                                                                                                                                                                                  Parameters: models: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., atom.lr.plot_shap_waterfall().

                                                                                                                                                                                  rows: int or str, default=0 Selection of rows to plot. The plot_shap_waterfall method does not support plotting multiple samples.

                                                                                                                                                                                  show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features.

                                                                                                                                                                                  target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as png. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns: plt.Figure or None. Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_shap_bar Plot SHAP's bar plot.

                                                                                                                                                                                  plot_shap_beeswarm Plot SHAP's beeswarm plot.

                                                                                                                                                                                  plot_shap_heatmap Plot SHAP's heatmap plot.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_waterfall/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\"LR\")\n>>> atom.plot_shap_waterfall(show=10)\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_slice/", "title": "plot_slice", "text": "

                                                                                                                                                                                  method plot_slice(models=None, params=None, metric=None, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot the parameter relationship in a study.

                                                                                                                                                                                  The color of the markers indicates the trial. This plot is only available for models that ran hyperparameter tuning.

                                                                                                                                                                                  Parameters: models: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., atom.lr.plot_slice().

                                                                                                                                                                                  params: str, segment, sequence or None, default=None Hyperparameters to plot. Use a sequence or add + between options to select more than one. If None, all the model's hyperparameters are selected.

                                                                                                                                                                                  metric: int or str, default=None Metric to plot (only for multi-metric runs). If str, add + between options to select more than one. If None, the metric used to run the pipeline is selected.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of hyperparameters shown.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns: go.Figure or None. Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_edf Plot the Empirical Distribution Function of a study.

                                                                                                                                                                                  plot_hyperparameters Plot hyperparameter relationships in a study.

                                                                                                                                                                                  plot_parallel_coordinate Plot high-dimensional parameter relationships in a study.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_slice/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\n...     models=\"RF\",\n...     metric=[\"f1\", \"recall\"],\n...     n_trials=15,\n... )\n>>> atom.plot_slice(params=(0, 1, 2))\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_successive_halving/", "title": "plot_successive_halving", "text": "

                                                                                                                                                                                  method plot_successive_halving(models=None, metric=None, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot scores per iteration of the successive halving.

                                                                                                                                                                                  Only use with models fitted using successive halving. Ensembles are ignored.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                  metric: int, str, sequence or None, default=None Metric to plot (only for multi-metric runs). Use a sequence or add + between options to select more than one. If None, the metric used to run the pipeline is selected.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns: go.Figure or None. Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_learning_curve Plot the learning curve: score vs number of training samples.

                                                                                                                                                                                  plot_results Plot the model results.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_successive_halving/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.successive_halving([\"Tree\", \"Bag\", \"RF\", \"LGB\"], n_bootstrap=5)\n>>> atom.plot_successive_halving()\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_terminator_improvement/", "title": "plot_terminator_improvement", "text": "

                                                                                                                                                                                  method plot_terminator_improvement(models=None, title=None, legend=\"upper right\", figsize=(900, 600), filename=None, display=True)[source]Plot the potentials for future objective improvement.

                                                                                                                                                                                  This function visualizes the objective improvement potentials. It helps to determine whether you should continue the optimization or not. The evaluated error is also plotted. Note that this function may take some time to compute the improvement potentials. This plot is only available for models that ran hyperparameter tuning.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  • The plot_terminator_improvement method is only available for models that ran hyperparameter tuning using cross-validation, e.g., using ht_params={'cv': 5}.
                                                                                                                                                                                  • This method does not support multi-objective optimizations (multi-metric runs).
                                                                                                                                                                                  • The calculation of the improvement can be slow. Set the memory parameter to cache the results and speed up repeated calls.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models that used hyperparameter tuning are selected.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"upper right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns go.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_pareto_front Plot the Pareto front of a study.

                                                                                                                                                                                  plot_timeline Plot the timeline of a study.

                                                                                                                                                                                  plot_trials Plot the hyperparameter tuning trials.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_terminator_improvement/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=100, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\"RF\", n_trials=10, ht_params={\"cv\": 5})\n>>> atom.plot_terminator_improvement()\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_threshold/", "title": "plot_threshold", "text": "

                                                                                                                                                                                  method plot_threshold(models=None, metric=None, rows=\"test\", target=0, steps=100, title=None, legend=\"lower left\", figsize=(900, 600), filename=None, display=True)[source]Plot metric performances against threshold values.

                                                                                                                                                                                  This plot is available only for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                  metric: str, func, scorer, sequence or None, default=None Metric to plot. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred), a scorer object or a sequence of these. Use a sequence or add + between options to select more than one. If None, the metric used to run the pipeline is selected.

                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows on which to calculate the metric.

                                                                                                                                                                                  target: int or str, default=0 Target column to look at. Only for multilabel tasks.

                                                                                                                                                                                  steps: int, default=100 Number of thresholds measured.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"lower left\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns go.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_calibration Plot the calibration curve for a binary classifier.

                                                                                                                                                                                  plot_confusion_matrix Plot a model's confusion matrix.

                                                                                                                                                                                  plot_probabilities Plot the probability distribution of the target classes.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_threshold/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"LR\", \"RF\"])\n>>> atom.plot_threshold()\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_timeline/", "title": "plot_timeline", "text": "

                                                                                                                                                                                  method plot_timeline(models=None, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot the timeline of a study.

                                                                                                                                                                                  This plot is only available for models that ran hyperparameter tuning.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models that used hyperparameter tuning are selected.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns go.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_edf Plot the Empirical Distribution Function of a study.

                                                                                                                                                                                  plot_slice Plot the parameter relationship in a study.

                                                                                                                                                                                  plot_terminator_improvement Plot the potentials for future objective improvement.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_timeline/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from optuna.pruners import PatientPruner\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\n...     models=\"LGB\",\n...     n_trials=15,\n...     ht_params={\"pruner\": PatientPruner(None, patience=2)},\n... )\n>>> atom.plot_timeline()\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_trials/", "title": "plot_trials", "text": "

                                                                                                                                                                                  method plot_trials(models=None, metric=None, title=None, legend=\"upper left\", figsize=(900, 800), filename=None, display=True)[source]Plot the hyperparameter tuning trials.

                                                                                                                                                                                  Creates a figure with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. The best trial is indicated with a star. This is the same plot as produced by ht_params={\"plot\": True}. This plot is only available for models that ran hyperparameter tuning.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models that used hyperparameter tuning are selected.

                                                                                                                                                                                  metric: int, str, sequence or None, default=None Metric to plot (only for multi-metric runs). Add + between options to select more than one. If None, all metrics are selected.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=\"upper left\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple, default=(900, 800) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  Returns go.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_evals Plot evaluation curves.

                                                                                                                                                                                  plot_hyperparameters Plot hyperparameter relationships in a study.

                                                                                                                                                                                  plot_results Plot the model results.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_trials/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=100, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"ET\", \"RF\"], n_trials=15)\n>>> atom.plot_trials()\n
                                                                                                                                                                                  "}, {"location": "API/plots/plot_wordcloud/", "title": "plot_wordcloud", "text": "

                                                                                                                                                                                  method plot_wordcloud(rows=\"dataset\", title=None, legend=None, figsize=(900, 600), filename=None, display=True, **kwargs)[source]Plot a wordcloud from the corpus.

                                                                                                                                                                                  The text for the plot is extracted from the column named corpus. If there is no column with that name, an exception is raised.

                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"dataset\" Selection of rows in the corpus to include in the wordcloud.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: str, dict or None, default=None Does nothing. Implemented for continuity of the API.

                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                  **kwargs Additional keyword arguments for the Wordcloud object.

                                                                                                                                                                                  Returns go.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  plot_ngrams Plot n-gram frequencies.

                                                                                                                                                                                  plot_pipeline Plot a diagram of the pipeline.

                                                                                                                                                                                  "}, {"location": "API/plots/plot_wordcloud/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> import numpy as np\n>>> from atom import ATOMClassifier\n>>> from sklearn.datasets import fetch_20newsgroups\n\n>>> X, y = fetch_20newsgroups(\n...     return_X_y=True,\n...     categories=[\"alt.atheism\", \"sci.med\", \"comp.windows.x\"],\n...     shuffle=True,\n...     random_state=1,\n... )\n>>> X = np.array(X).reshape(-1, 1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.textclean()\n>>> atom.textnormalize()\n>>> atom.plot_wordcloud()\n
                                                                                                                                                                                  "}, {"location": "API/training/directclassifier/", "title": "DirectClassifier", "text": "

                                                                                                                                                                                  class atom.training.DirectClassifier(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source] Train and evaluate the models in a direct fashion.

                                                                                                                                                                                  The following steps are applied to every model:

                                                                                                                                                                                  1. Apply hyperparameter tuning (optional).
                                                                                                                                                                                  2. Fit the model on the training set using the best combination of hyperparameters found.
                                                                                                                                                                                  3. Evaluate the model on the test set.
                                                                                                                                                                                  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

                                                                                                                                                                                  Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used.

                                                                                                                                                                                  metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature function(y_true, y_pred, **kwargs) -> score, a scorer object or a sequence of these. If None, a default metric is selected for every task:

                                                                                                                                                                                  • \"f1\" for binary classification
                                                                                                                                                                                  • \"f1_weighted\" for multiclass(-multioutput) classification
                                                                                                                                                                                  • \"average_precision\" for multilabel classification

                                                                                                                                                                                  n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                  est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add _fit to the parameter's name to pass it to the estimator's fit method instead of the constructor.

                                                                                                                                                                                  ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include:

                                                                                                                                                                                  • cv: int, cv-generator, dict or sequence, default=1 Cross-validation object or number of splits. If 1, the data is randomly split in a subtrain and validation set.
                                                                                                                                                                                  • plot: bool, dict or sequence, default=False Whether to plot the optimization's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. See the plot_trials method.
                                                                                                                                                                                  • distributions: dict, sequence or None, default=None Custom hyperparameter distributions. If None, it uses the model's predefined distributions. Read more in the user guide.
                                                                                                                                                                                  • tags: dict, sequence or None, default=None Custom tags for the model's trial and mlflow run.
                                                                                                                                                                                  • **kwargs Additional keyword arguments for the constructor of the study class or the optimize method.

                                                                                                                                                                                  n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                  parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using parallel=True turns off the verbosity of the models during training. Note that many models also have built-in parallelizations (often when the estimator has the n_jobs parameter).

                                                                                                                                                                                  errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from:

                                                                                                                                                                                  • \"raise\": Raise any encountered exception.
                                                                                                                                                                                  • \"skip\": Skip a failed model. This model is not accessible after training.
                                                                                                                                                                                  • \"keep\": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter optimization after failure without losing previous successful trials.

                                                                                                                                                                                  n_jobs: int, default=1 Number of cores to use for parallel processing.

                                                                                                                                                                                  • If >0: Number of cores to use.
                                                                                                                                                                                  • If -1: Use all available cores.
                                                                                                                                                                                  • If <-1: Use number of cores + 1 + n_jobs (e.g., n_jobs=-2 uses all cores but one).

                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                    • \"sklearnex\"
                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                  backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

                                                                                                                                                                                  • \"loky\": Single-node, process-based parallelism.
                                                                                                                                                                                  • \"multiprocessing\": Legacy single-node, process-based parallelism. Less robust than loky.
                                                                                                                                                                                  • \"threading\": Single-node, thread-based parallelism.
                                                                                                                                                                                  • \"ray\": Multi-node, process-based parallelism.

                                                                                                                                                                                  memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide.

                                                                                                                                                                                  • If False: No caching is performed.
                                                                                                                                                                                  • If True: A default temp directory is used.
                                                                                                                                                                                  • If str: Path to the caching directory.
                                                                                                                                                                                  • If Path: A pathlib.Path to the caching directory.
                                                                                                                                                                                  • If Memory: Object with the joblib.Memory interface.

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                  warnings: bool or str, default=False

                                                                                                                                                                                  • If True: Default warning action (equal to \"once\").
                                                                                                                                                                                  • If False: Suppress all warnings (equal to \"ignore\").
                                                                                                                                                                                  • If str: One of python's warnings filters.

                                                                                                                                                                                  Changing this parameter affects the PYTHONWarnings environment. ATOM can't manage warnings that go from C/C++ code to stdout.

                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic name.
                                                                                                                                                                                  • If Path: A pathlib.Path to the log file.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed.

                                                                                                                                                                                  random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  ATOMClassifier Main class for classification tasks.

                                                                                                                                                                                  SuccessiveHalvingClassifier Train and evaluate the models in a successive halving fashion.

                                                                                                                                                                                  TrainSizingClassifier Train and evaluate the models in a train sizing fashion.

                                                                                                                                                                                  "}, {"location": "API/training/directclassifier/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom.training import DirectClassifier\n>>> from sklearn.datasets import load_breast_cancer\n>>> from sklearn.model_selection import train_test_split\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> train, test = train_test_split(\n...     X.merge(y.to_frame(), left_index=True, right_index=True),\n...     test_size=0.3,\n... )\n\n>>> runner = DirectClassifier(models=[\"LR\", \"RF\"], verbose=2)\n>>> runner.run(train, test)\n\n\nTraining ========================= >>\nModels: LR, RF\nMetric: f1\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.992\nTest evaluation --> f1: 0.9767\nTime elapsed: 0.104s\n-------------------------------------------------\nTime: 0.104s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.968\nTime elapsed: 0.204s\n-------------------------------------------------\nTime: 0.204s\n\n\nFinal results ==================== >>\nTotal time: 0.314s\n-------------------------------------\nLogisticRegression --> f1: 0.9767 !\nRandomForest       --> f1: 0.968\n\n\n>>> # Analyze the results\n>>> print(runner.results)\n\n    f1_train  f1_test  time_fit      time\nLR     0.992   0.9767  0.104497  0.104497\nRF     1.000   0.9680  0.204185  0.204185\n\n\n>>> print(runner.evaluate())\n\n    accuracy      ap      ba      f1  jaccard     mcc  precision  recall     auc\nLR    0.9708  0.9976  0.9702  0.9767   0.9545  0.9374     0.9813  0.9722  0.9959\nRF    0.9591  0.9490  0.9511  0.9680   0.9381  0.9118     0.9550  0.9815  0.9511\n
                                                                                                                                                                                  "}, {"location": "API/training/directclassifier/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/directclassifier/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                  The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.

                                                                                                                                                                                  Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set.

                                                                                                                                                                                  This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                  "}, {"location": "API/training/directclassifier/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                  The utility attributes are used to access information about the models in the instance after training.

                                                                                                                                                                                  Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance.

                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. winner: model | NoneBest performing model.

                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. results: pd.DataFrameOverview of the training results.

                                                                                                                                                                                  All durations are in seconds. Possible values include:

                                                                                                                                                                                  • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                  • [metric]_train: Metric score on the train set.
                                                                                                                                                                                  • [metric]_test: Metric score on the test set.
                                                                                                                                                                                  • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                  • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                  • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                  • time: Total duration of the run.

                                                                                                                                                                                  "}, {"location": "API/training/directclassifier/#tracking-attributes", "title": "Tracking attributes", "text": "

                                                                                                                                                                                  The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.

                                                                                                                                                                                  Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline.

                                                                                                                                                                                  "}, {"location": "API/training/directclassifier/#plot-attributes", "title": "Plot attributes", "text": "

                                                                                                                                                                                  The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

                                                                                                                                                                                  Attributespalette: str | Sequence[str]Color palette.

                                                                                                                                                                                  Specify one of plotly's built-in palettes or create a custom one, e.g., atom.palette = [\"red\", \"green\", \"blue\"]. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers.

                                                                                                                                                                                  "}, {"location": "API/training/directclassifier/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.

                                                                                                                                                                                  available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.

                                                                                                                                                                                  method available_models()[source]Give an overview of the available predefined models.

                                                                                                                                                                                  Returnspd.DataFrame Information about the available predefined models. Columns include:

                                                                                                                                                                                  • acronym: Model's acronym (used to call the model).
                                                                                                                                                                                  • model: Name of the model's class.
                                                                                                                                                                                  • estimator: The model's underlying estimator.
                                                                                                                                                                                  • module: The estimator's module.
                                                                                                                                                                                  • needs_scaling: Whether the model requires feature scaling.
                                                                                                                                                                                  • accepts_sparse: Whether the model accepts sparse matrices.
                                                                                                                                                                                  • native_multilabel: Whether the model has native support for multilabel tasks.
                                                                                                                                                                                  • native_multioutput: Whether the model has native support for multioutput tasks.
                                                                                                                                                                                  • has_validation: Whether the model has in-training validation.
                                                                                                                                                                                  • supports_engines: Engines supported by the model.

                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from all models.

                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                  • Shap values
                                                                                                                                                                                  • App instance
                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                  method delete(models=None)[source]Delete models.

                                                                                                                                                                                  If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted.

                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.

                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task.

                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.

                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                  Returnspd.DataFrame Scores of the models.

                                                                                                                                                                                  method export_pipeline(model=None)[source]Export the internal pipeline.

                                                                                                                                                                                  This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

                                                                                                                                                                                  Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported.

                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                  method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.

                                                                                                                                                                                  Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.

                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

                                                                                                                                                                                  Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks.

                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                  method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.

                                                                                                                                                                                  Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.

                                                                                                                                                                                  Parametersother: Runner Instance with which to merge. Should be of the same class as self.

                                                                                                                                                                                  suffix: str, default=\"2\" Branches and models with conflicting names are merged adding suffix to the end of their names.

                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                  method run(*arrays)[source]Train and evaluate the models.

                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                  Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are:

                                                                                                                                                                                  • train, test
                                                                                                                                                                                  • X_train, X_test, y_train, y_test
                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test)

                                                                                                                                                                                  method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                  save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance.

                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                  method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                  name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: Stack.

                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the final_estimator parameter.

                                                                                                                                                                                  method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                  name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: Vote.

                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's voting instance.

                                                                                                                                                                                  "}, {"location": "API/training/directforecaster/", "title": "DirectForecaster", "text": "

                                                                                                                                                                                  class atom.training.DirectForecaster(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a direct fashion.

                                                                                                                                                                                  The following steps are applied to every model:

                                                                                                                                                                                  1. Apply hyperparameter tuning (optional).
                                                                                                                                                                                  2. Fit the model on the training set using the best combination of hyperparameters found.
                                                                                                                                                                                  3. Evaluate the model on the test set.
                                                                                                                                                                                  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

                                                                                                                                                                                  Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used.

                                                                                                                                                                                  metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature function(y_true, y_pred, **kwargs) -> score, a scorer object or a sequence of these. If None, the default metric mean_absolute_percentage_error is selected.

                                                                                                                                                                                  n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                  est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add _fit to the parameter's name to pass it to the estimator's fit method instead of the constructor.

                                                                                                                                                                                  ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include:

                                                                                                                                                                                  • cv: int, cv-generator, dict or sequence, default=1 Cross-validation object or number of splits. If 1, the data is randomly split in a subtrain and validation set.
                                                                                                                                                                                  • plot: bool, dict or sequence, default=False Whether to plot the optimization's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. See the plot_trials method.
                                                                                                                                                                                  • distributions: dict, sequence or None, default=None Custom hyperparameter distributions. If None, it uses the model's predefined distributions. Read more in the user guide.
                                                                                                                                                                                  • tags: dict, sequence or None, default=None Custom tags for the model's trial and mlflow run.
                                                                                                                                                                                  • **kwargs Additional Keyword arguments for the constructor of the study class or the optimize method.

                                                                                                                                                                                  n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                  parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using parallel=True turns off the verbosity of the models during training. Note that many models also have built-in parallelization (often when the estimator has the n_jobs parameter).

                                                                                                                                                                                  errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from:

                                                                                                                                                                                  • \"raise\": Raise any encountered exception.
                                                                                                                                                                                  • \"skip\": Skip a failed model. This model is not accessible after training.
                                                                                                                                                                                  • \"keep\": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter optimization after failure without losing previous successful trials.

                                                                                                                                                                                  n_jobs: int, default=1 Number of cores to use for parallel processing.

                                                                                                                                                                                  • If >0: Number of cores to use.
                                                                                                                                                                                  • If -1: Use all available cores.
                                                                                                                                                                                  • If <-1: Use number of cores + 1 + n_jobs (e.g., -2 uses all cores but one).

                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                    • \"sklearnex\"
                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                  backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

                                                                                                                                                                                  • \"loky\": Single-node, process-based parallelism.
                                                                                                                                                                                  • \"multiprocessing\": Legacy single-node, process-based parallelism. Less robust than loky.
                                                                                                                                                                                  • \"threading\": Single-node, thread-based parallelism.
                                                                                                                                                                                  • \"ray\": Multi-node, process-based parallelism.

                                                                                                                                                                                  memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide.

                                                                                                                                                                                  • If False: No caching is performed.
                                                                                                                                                                                  • If True: A default temp directory is used.
                                                                                                                                                                                  • If str: Path to the caching directory.
                                                                                                                                                                                  • If Path: A pathlib.Path to the caching directory.
                                                                                                                                                                                  • If Memory: Object with the joblib.Memory interface.

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                  warnings: bool or str, default=False

                                                                                                                                                                                  • If True: Default warning action (equal to \"once\").
                                                                                                                                                                                  • If False: Suppress all warnings (equal to \"ignore\").
                                                                                                                                                                                  • If str: One of python's warnings filters.

                                                                                                                                                                                  Changing this parameter affects the PYTHONWarnings environment. ATOM can't manage warnings that go from C/C++ code to stdout.

                                                                                                                                                                                  logger: str, Path, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic name.
                                                                                                                                                                                  • If Path: A pathlib.Path to the log file.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed.

                                                                                                                                                                                  random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  ATOMForecaster Main class for forecasting tasks.

                                                                                                                                                                                  SuccessiveHalvingForecaster Train and evaluate the models in a successive halving fashion.

                                                                                                                                                                                  TrainSizingForecaster Train and evaluate the models in a train sizing fashion.

                                                                                                                                                                                  "}, {"location": "API/training/directforecaster/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom.training import DirectForecaster\n>>> from sktime.datasets import load_airline\n>>> from sktime.split import temporal_train_test_split\n\n>>> y = load_airline()\n\n>>> train, test = temporal_train_test_split(y, test_size=0.2)\n\n>>> runner = DirectForecaster(models=[\"ES\", \"ETS\"], verbose=2)\n>>> runner.run(train, test)\n\n\nTraining ========================= >>\nModels: ES, ETS\nMetric: mape\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0868\nTest evaluation --> mape: -0.2018\nTime elapsed: 0.019s\n-------------------------------------------------\nTime: 0.019s\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0863\nTest evaluation --> mape: -0.202\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nFinal results ==================== >>\nTotal time: 0.041s\n-------------------------------------\nExponentialSmoothing --> mape: -0.2018 !\nETS                  --> mape: -0.202\n\n\n>>> # Analyze the results\n>>> print(runner.results)\n\n     mape_train  mape_test  time_fit      time\nES      -0.0868    -0.2018  0.019017  0.019017\nETS     -0.0863    -0.2020  0.020018  0.020018\n\n\n>>> print(runner.evaluate())\n\n         mae    mape        mse      r2     rmse\nES  -81.3862 -0.2018 -8661.7730 -0.4189 -93.0686\nETS -81.4454 -0.2020 -8673.3633 -0.4208 -93.1309\n
                                                                                                                                                                                  "}, {"location": "API/training/directforecaster/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/directforecaster/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                  The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.

                                                                                                                                                                                  Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set.

                                                                                                                                                                                  This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                  "}, {"location": "API/training/directforecaster/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                  The utility attributes are used to access information about the models in the instance after training.

                                                                                                                                                                                  Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance.

                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. winner: model | NoneBest performing model.

                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. results: pd.DataFrameOverview of the training results.

                                                                                                                                                                                  All durations are in seconds. Possible values include:

                                                                                                                                                                                  • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                  • [metric]_train: Metric score on the train set.
                                                                                                                                                                                  • [metric]_test: Metric score on the test set.
                                                                                                                                                                                  • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                  • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                  • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                  • time: Total duration of the run.

                                                                                                                                                                                  "}, {"location": "API/training/directforecaster/#tracking-attributes", "title": "Tracking attributes", "text": "

                                                                                                                                                                                  The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.

                                                                                                                                                                                  Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline.

                                                                                                                                                                                  "}, {"location": "API/training/directforecaster/#plot-attributes", "title": "Plot attributes", "text": "

                                                                                                                                                                                  The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

                                                                                                                                                                                  Attributespalette: str | Sequence[str]Color palette.

                                                                                                                                                                                  Specify one of plotly's built-in palettes or create a custom one, e.g., atom.palette = [\"red\", \"green\", \"blue\"]. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers.

                                                                                                                                                                                  "}, {"location": "API/training/directforecaster/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.

                                                                                                                                                                                  available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.

                                                                                                                                                                                  method available_models()[source]Give an overview of the available predefined models.

                                                                                                                                                                                  Returnspd.DataFrame Information about the available predefined models. Columns include:

                                                                                                                                                                                  • acronym: Model's acronym (used to call the model).
                                                                                                                                                                                  • model: Name of the model's class.
                                                                                                                                                                                  • estimator: The model's underlying estimator.
                                                                                                                                                                                  • module: The estimator's module.
                                                                                                                                                                                  • needs_scaling: Whether the model requires feature scaling.
                                                                                                                                                                                  • accepts_sparse: Whether the model accepts sparse matrices.
                                                                                                                                                                                  • native_multilabel: Whether the model has native support for multilabel tasks.
                                                                                                                                                                                  • native_multioutput: Whether the model has native support for multioutput tasks.
                                                                                                                                                                                  • has_validation: Whether the model has in-training validation.
                                                                                                                                                                                  • supports_engines: Engines supported by the model.

                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                  Parametersrows: int, default=1 Number of rows of plots in the canvas.

                                                                                                                                                                                  cols: int, default=2 Number of columns of plots in the canvas.

                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from all models.

                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                  • Shap values
                                                                                                                                                                                  • App instance
                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                  method delete(models=None)[source]Delete models.

                                                                                                                                                                                  If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted.

                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.

                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task.

                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.

                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                  Returnspd.DataFrame Scores of the models.

                                                                                                                                                                                  method export_pipeline(model=None)[source]Export the internal pipeline.

                                                                                                                                                                                  This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

                                                                                                                                                                                  Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported.

                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                  method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.

                                                                                                                                                                                  Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.

                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

                                                                                                                                                                                  Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks.

                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                  method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.

                                                                                                                                                                                  Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.

                                                                                                                                                                                  Parametersother: Runner Instance with which to merge. Should be of the same class as self.

                                                                                                                                                                                  suffix: str, default=\"2\" Branches and models with conflicting names are merged adding suffix to the end of their names.

                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                  method run(*arrays)[source]Train and evaluate the models.

                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                  Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are:

                                                                                                                                                                                  • train, test
                                                                                                                                                                                  • X_train, X_test, y_train, y_test
                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test)

                                                                                                                                                                                  method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                  save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance.

                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                  method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                  name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: Stack.

                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the final_estimator parameter.

                                                                                                                                                                                  method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                  name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: Vote.

                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's voting instance.

                                                                                                                                                                                  "}, {"location": "API/training/directregressor/", "title": "DirectRegressor", "text": "

                                                                                                                                                                                  class atom.training.DirectRegressor(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a direct fashion.

                                                                                                                                                                                  The following steps are applied to every model:

                                                                                                                                                                                  1. Apply hyperparameter tuning (optional).
                                                                                                                                                                                  2. Fit the model on the training set using the best combination of hyperparameters found.
                                                                                                                                                                                  3. Evaluate the model on the test set.
                                                                                                                                                                                  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

                                                                                                                                                                                  Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used.

                                                                                                                                                                                  metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature function(y_true, y_pred, **kwargs) -> score, a scorer object or a sequence of these. If None, the default metric r2 is selected.

                                                                                                                                                                                  n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                  est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add _fit to the parameter's name to pass it to the estimator's fit method instead of the constructor.

                                                                                                                                                                                  ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include:

                                                                                                                                                                                  • cv: int, cv-generator, dict or sequence, default=1 Cross-validation object or number of splits. If 1, the data is randomly split in a subtrain and validation set.
                                                                                                                                                                                  • plot: bool, dict or sequence, default=False Whether to plot the optimization's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. See the plot_trials method.
                                                                                                                                                                                  • distributions: dict, sequence or None, default=None Custom hyperparameter distributions. If None, it uses the model's predefined distributions. Read more in the user guide.
                                                                                                                                                                                  • tags: dict, sequence or None, default=None Custom tags for the model's trial and mlflow run.
                                                                                                                                                                                  • **kwargs Additional keyword arguments for the constructor of the study class or the optimize method.

                                                                                                                                                                                  n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                  parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using parallel=True turns off the verbosity of the models during training. Note that many models also have built-in parallelization (often when the estimator has the n_jobs parameter).

                                                                                                                                                                                  errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from:

                                                                                                                                                                                  • \"raise\": Raise any encountered exception.
                                                                                                                                                                                  • \"skip\": Skip a failed model. This model is not accessible after training.
                                                                                                                                                                                  • \"keep\": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter optimization after failure without losing previous successful trials.

                                                                                                                                                                                  n_jobs: int, default=1 Number of cores to use for parallel processing.

                                                                                                                                                                                  • If >0: Number of cores to use.
                                                                                                                                                                                  • If -1: Use all available cores.
                                                                                                                                                                                  • If <-1: Use number of cores + 1 + n_jobs (e.g., -2 uses all cores but one).

                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                    • \"sklearnex\"
                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                  backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

                                                                                                                                                                                  • \"loky\": Single-node, process-based parallelism.
                                                                                                                                                                                  • \"multiprocessing\": Legacy single-node, process-based parallelism. Less robust than loky.
                                                                                                                                                                                  • \"threading\": Single-node, thread-based parallelism.
                                                                                                                                                                                  • \"ray\": Multi-node, process-based parallelism.

                                                                                                                                                                                  memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide.

                                                                                                                                                                                  • If False: No caching is performed.
                                                                                                                                                                                  • If True: A default temp directory is used.
                                                                                                                                                                                  • If str: Path to the caching directory.
                                                                                                                                                                                  • If Path: A pathlib.Path to the caching directory.
                                                                                                                                                                                  • If Memory: Object with the joblib.Memory interface.

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                  warnings: bool or str, default=False

                                                                                                                                                                                  • If True: Default warning action (equal to \"once\").
                                                                                                                                                                                  • If False: Suppress all warnings (equal to \"ignore\").
                                                                                                                                                                                  • If str: One of python's warnings filters.

                                                                                                                                                                                  Changing this parameter affects the PYTHONWarnings environment. ATOM can't manage warnings that go from C/C++ code to stdout.

                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic name.
                                                                                                                                                                                  • If Path: A pathlib.Path to the log file.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed.

                                                                                                                                                                                  random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  ATOMRegressor Main class for regression tasks.

                                                                                                                                                                                  SuccessiveHalvingRegressor Train and evaluate the models in a successive halving fashion.

                                                                                                                                                                                  TrainSizingRegressor Train and evaluate the models in a train sizing fashion.

                                                                                                                                                                                  "}, {"location": "API/training/directregressor/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom.training import DirectRegressor\n>>> from sklearn.datasets import load_digits\n>>> from sklearn.model_selection import train_test_split\n\n>>> X, y = load_digits(return_X_y=True, as_frame=True)\n\n>>> train, test = train_test_split(\n...     X.merge(y.to_frame(), left_index=True, right_index=True),\n...     test_size=0.3,\n... )\n\n>>> runner = DirectRegressor(models=[\"OLS\", \"RF\"], verbose=2)\n>>> runner.run(train, test)\n\n\nTraining ========================= >>\nModels: OLS, RF\nMetric: r2\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.5991\nTest evaluation --> r2: 0.5765\nTime elapsed: 0.154s\n-------------------------------------------------\nTime: 0.154s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.9803\nTest evaluation --> r2: 0.8803\nTime elapsed: 1.594s\n-------------------------------------------------\nTime: 1.594s\n\n\nFinal results ==================== >>\nTotal time: 1.749s\n-------------------------------------\nOrdinaryLeastSquares --> r2: 0.5765\nRandomForest         --> r2: 0.8803 !\n\n\n>>> # Analyze the results\n>>> print(runner.results)\n\n     r2_train  r2_test  time_fit      time\nOLS    0.5991   0.5765  0.153989  0.153989\nRF     0.9803   0.8803  1.594449  1.594449\n\n\n>>> print(runner.evaluate())\n\n        mae          mape     mse      r2    rmse\nOLS -1.4553 -9.184808e+14 -3.4564  0.5765 -1.8591\nRF  -0.6098 -2.854782e+14 -0.9773  0.8803 -0.9886\n
                                                                                                                                                                                  "}, {"location": "API/training/directregressor/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/directregressor/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                  The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.

                                                                                                                                                                                  Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set.

                                                                                                                                                                                  This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                  "}, {"location": "API/training/directregressor/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                  The utility attributes are used to access information about the models in the instance after training.

                                                                                                                                                                                  Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance.

                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. winner: model | NoneBest performing model.

                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. results: pd.DataFrameOverview of the training results.

                                                                                                                                                                                  All durations are in seconds. Possible values include:

                                                                                                                                                                                  • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                  • [metric]_train: Metric score on the train set.
                                                                                                                                                                                  • [metric]_test: Metric score on the test set.
                                                                                                                                                                                  • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                  • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                  • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                  • time: Total duration of the run.

                                                                                                                                                                                  "}, {"location": "API/training/directregressor/#tracking-attributes", "title": "Tracking attributes", "text": "

                                                                                                                                                                                  The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.

                                                                                                                                                                                  Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline.

                                                                                                                                                                                  "}, {"location": "API/training/directregressor/#plot-attributes", "title": "Plot attributes", "text": "

                                                                                                                                                                                  The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

                                                                                                                                                                                  Attributespalette: str | Sequence[str]Color palette.

                                                                                                                                                                                  Specify one of plotly's built-in palettes or create a custom one, e.g., atom.palette = [\"red\", \"green\", \"blue\"]. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers.

                                                                                                                                                                                  "}, {"location": "API/training/directregressor/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.

                                                                                                                                                                                  available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.

                                                                                                                                                                                  method available_models()[source]Give an overview of the available predefined models.

                                                                                                                                                                                  Returnspd.DataFrame Information about the available predefined models. Columns include:

                                                                                                                                                                                  • acronym: Model's acronym (used to call the model).
                                                                                                                                                                                  • model: Name of the model's class.
                                                                                                                                                                                  • estimator: The model's underlying estimator.
                                                                                                                                                                                  • module: The estimator's module.
                                                                                                                                                                                  • needs_scaling: Whether the model requires feature scaling.
                                                                                                                                                                                  • accepts_sparse: Whether the model accepts sparse matrices.
                                                                                                                                                                                  • native_multilabel: Whether the model has native support for multilabel tasks.
                                                                                                                                                                                  • native_multioutput: Whether the model has native support for multioutput tasks.
                                                                                                                                                                                  • has_validation: Whether the model has in-training validation.
                                                                                                                                                                                  • supports_engines: Engines supported by the model.

                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from all models.

                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                  • Shap values
                                                                                                                                                                                  • App instance
                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                  method delete(models=None)[source]Delete models.

                                                                                                                                                                                  If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted.

                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.

                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task.

                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.

                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                  Returnspd.DataFrame Scores of the models.

                                                                                                                                                                                  method export_pipeline(model=None)[source]Export the internal pipeline.

                                                                                                                                                                                  This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

                                                                                                                                                                                  Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported.

                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                  method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.

                                                                                                                                                                                  Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.

                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

                                                                                                                                                                                  Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks.

                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                  method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.

                                                                                                                                                                                  Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.

                                                                                                                                                                                  Parametersother: Runner Instance with which to merge. Should be of the same class as self.

                                                                                                                                                                                  suffix: str, default=\"2\" Branches and models with conflicting names are merged adding suffix to the end of their names.

                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                  method run(*arrays)[source]Train and evaluate the models.

                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                  Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are:

                                                                                                                                                                                  • train, test
                                                                                                                                                                                  • X_train, X_test, y_train, y_test
                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test)

                                                                                                                                                                                  method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                  save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance.

                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                  method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                  name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: Stack.

                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the final_estimator parameter.

                                                                                                                                                                                  method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                  name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: Vote.

                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's voting instance.

                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingclassifier/", "title": "SuccessiveHalvingClassifier", "text": "

                                                                                                                                                                                  class atom.training.SuccessiveHalvingClassifier(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a successive halving fashion.

                                                                                                                                                                                  The following steps are applied to every model (per iteration):

                                                                                                                                                                                  1. Apply hyperparameter tuning (optional).
                                                                                                                                                                                  2. Fit the model on the training set using the best combination of hyperparameters found.
                                                                                                                                                                                  3. Evaluate the model on the test set.
                                                                                                                                                                                  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

                                                                                                                                                                                  Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used.

                                                                                                                                                                                  metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature function(y_true, y_pred, **kwargs) -> score, a scorer object or a sequence of these. If None, a default metric is selected for every task:

                                                                                                                                                                                  • \"f1\" for binary classification
                                                                                                                                                                                  • \"f1_weighted\" for multiclass(-multioutput) classification
                                                                                                                                                                                  • \"average_precision\" for multilabel classification

                                                                                                                                                                                  skip_runs: int, default=0 Skip last skip_runs runs of the successive halving.

                                                                                                                                                                                  n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                  est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add _fit to the parameter's name to pass it to the estimator's fit method instead of the constructor.

                                                                                                                                                                                  ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include:

                                                                                                                                                                                  • cv: int, cv-generator, dict or sequence, default=1 Cross-validation object or number of splits. If 1, the data is randomly split in a subtrain and validation set.
                                                                                                                                                                                  • plot: bool, dict or sequence, default=False Whether to plot the optimization's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. See the plot_trials method.
                                                                                                                                                                                  • distributions: dict, sequence or None, default=None Custom hyperparameter distributions. If None, it uses the model's predefined distributions. Read more in the user guide.
                                                                                                                                                                                  • tags: dict, sequence or None, default=None Custom tags for the model's trial and mlflow run.
                                                                                                                                                                                  • **kwargs Additional keyword arguments for the constructor of the study class or the optimize method.

                                                                                                                                                                                  n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                  parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using parallel=True turns off the verbosity of the models during training. Note that many models also have built-in parallelization (often when the estimator has the n_jobs parameter).

                                                                                                                                                                                  errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from:

                                                                                                                                                                                  • \"raise\": Raise any encountered exception.
                                                                                                                                                                                  • \"skip\": Skip a failed model. This model is not accessible after training.
                                                                                                                                                                                  • \"keep\": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter optimization after failure without losing previous successful trials.

                                                                                                                                                                                  n_jobs: int, default=1 Number of cores to use for parallel processing.

                                                                                                                                                                                  • If >0: Number of cores to use.
                                                                                                                                                                                  • If -1: Use all available cores.
                                                                                                                                                                                  • If <-1: Use number of cores - 1 + n_jobs.

                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                    • \"sklearnex\"
                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                  backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

                                                                                                                                                                                  • \"loky\": Single-node, process-based parallelism.
                                                                                                                                                                                  • \"multiprocessing\": Legacy single-node, process-based parallelism. Less robust than loky.
                                                                                                                                                                                  • \"threading\": Single-node, thread-based parallelism.
                                                                                                                                                                                  • \"ray\": Multi-node, process-based parallelism.

                                                                                                                                                                                  memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide.

                                                                                                                                                                                  • If False: No caching is performed.
                                                                                                                                                                                  • If True: A default temp directory is used.
                                                                                                                                                                                  • If str: Path to the caching directory.
                                                                                                                                                                                  • If Path: A pathlib.Path to the caching directory.
                                                                                                                                                                                  • If Memory: Object with the joblib.Memory interface.

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                  warnings: bool or str, default=False

                                                                                                                                                                                  • If True: Default warning action (equal to \"once\").
                                                                                                                                                                                  • If False: Suppress all warnings (equal to \"ignore\").
                                                                                                                                                                                  • If str: One of python's warnings filters.

                                                                                                                                                                                  Changing this parameter affects the PYTHONWarnings environment. ATOM can't manage warnings that go from C/C++ code to stdout.

                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic name.
                                                                                                                                                                                  • If Path: A pathlib.Path to the log file.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed.

                                                                                                                                                                                  random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  ATOMClassifier Main class for classification tasks.

                                                                                                                                                                                  DirectClassifier Train and evaluate the models in a direct fashion.

                                                                                                                                                                                  TrainSizingClassifier Train and evaluate the models in a train sizing fashion.

                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingclassifier/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom.training import SuccessiveHalvingClassifier\n>>> from sklearn.datasets import load_breast_cancer\n>>> from sklearn.model_selection import train_test_split\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> train, test = train_test_split(\n...     X.merge(y.to_frame(), left_index=True, right_index=True),\n...     test_size=0.3,\n... )\n\n>>> runner = SuccessiveHalvingClassifier([\"LR\", \"RF\"], verbose=2)\n>>> runner.run(train, test)\n\n\nTraining ========================= >>\nMetric: f1\n\n\nRun: 0 =========================== >>\nModels: LR2, RF2\nSize of training set: 398 (50%)\nSize of test set: 171\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.996\nTest evaluation --> f1: 0.9677\nTime elapsed: 0.086s\n-------------------------------------------------\nTime: 0.086s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9444\nTime elapsed: 0.137s\n-------------------------------------------------\nTime: 0.137s\n\n\nFinal results ==================== >>\nTotal time: 0.228s\n-------------------------------------\nLogisticRegression --> f1: 0.9677 !\nRandomForest       --> f1: 0.9444\n\n\nRun: 1 =========================== >>\nModels: LR1\nSize of training set: 398 (100%)\nSize of test set: 171\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.994\nTest evaluation --> f1: 0.9818\nTime elapsed: 0.095s\n-------------------------------------------------\nTime: 0.095s\n\n\nFinal results ==================== >>\nTotal time: 0.098s\n-------------------------------------\nLogisticRegression --> f1: 0.9818\n\n\n>>> # Analyze the results\n>>> 
print(runner.results)\n\n            f1_train  f1_test  time_fit      time\nfrac model                                       \n0.5  LR2       0.996   0.9677  0.086078  0.086078\n     RF2       1.000   0.9444  0.137125  0.137125\n1.0  LR1       0.994   0.9818  0.094800  0.094800\n\n\n>>> print(runner.evaluate())\n\n     accuracy      ap      ba      f1  jaccard     mcc  precision  recall     auc\nLR2    0.9591  0.9963  0.9609  0.9677   0.9375  0.9124     0.9813  0.9545  0.9937\nRF2    0.9298  0.9391  0.9308  0.9444   0.8947  0.8504     0.9623  0.9273  0.9308\nLR1    0.9766  0.9972  0.9745  0.9818   0.9643  0.9490     0.9818  0.9818  0.9952\n
                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingclassifier/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/successivehalvingclassifier/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                  The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.

                                                                                                                                                                                  Attributes dataset: DataFrame | modin.pandas.dataframe.DataFrame Complete data set. train: DataFrame | modin.pandas.dataframe.DataFrame Training set. test: DataFrame | modin.pandas.dataframe.DataFrame Test set. X: DataFrame | modin.pandas.dataframe.DataFrame Feature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrame Target column(s). holdout: DataFrame | None Holdout set.

                                                                                                                                                                                  This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrame Features of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrame Target column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrame Features of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrame Target column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer] Shape of the dataset (n_rows, n_columns). columns: Index Name of all the columns. n_columns: int | numpy.integer Number of columns. features: Index Name of the features. n_features: int | numpy.integer Number of features. target: str | list[str] Name of the target column(s).

                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingclassifier/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                  The utility attributes are used to access information about the models in the instance after training.

                                                                                                                                                                                  Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance.

                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. winner: model | NoneBest performing model.

                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. results: pd.DataFrameOverview of the training results.

                                                                                                                                                                                  All durations are in seconds. Possible values include:

                                                                                                                                                                                  • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                  • [metric]_train: Metric score on the train set.
                                                                                                                                                                                  • [metric]_test: Metric score on the test set.
                                                                                                                                                                                  • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                  • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                  • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                  • time: Total duration of the run.

                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingclassifier/#tracking-attributes", "title": "Tracking attributes", "text": "

                                                                                                                                                                                  The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.

                                                                                                                                                                                  Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline.

                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingclassifier/#plot-attributes", "title": "Plot attributes", "text": "

                                                                                                                                                                                  The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

                                                                                                                                                                                  Attributespalette: str | Sequence[str]Color palette.

                                                                                                                                                                                  Specify one of plotly's built-in palettes or create a custom one, e.g., atom.palette = [\"red\", \"green\", \"blue\"]. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers.

                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingclassifier/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.

                                                                                                                                                                                  available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.

                                                                                                                                                                                  method available_models()[source]Give an overview of the available predefined models.

                                                                                                                                                                                  Returnspd.DataFrame Information about the available predefined models. Columns include:

                                                                                                                                                                                  • acronym: Model's acronym (used to call the model).
                                                                                                                                                                                  • model: Name of the model's class.
                                                                                                                                                                                  • estimator: The model's underlying estimator.
                                                                                                                                                                                  • module: The estimator's module.
                                                                                                                                                                                  • needs_scaling: Whether the model requires feature scaling.
                                                                                                                                                                                  • accepts_sparse: Whether the model accepts sparse matrices.
                                                                                                                                                                                  • native_multilabel: Whether the model has native support for multilabel tasks.
                                                                                                                                                                                  • native_multioutput: Whether the model has native support for multioutput tasks.
                                                                                                                                                                                  • has_validation: Whether the model has in-training validation.
                                                                                                                                                                                  • supports_engines: Engines supported by the model.

                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from all models.

                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                  • Shap values
                                                                                                                                                                                  • App instance
                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                  method delete(models=None)[source]Delete models.

                                                                                                                                                                                  If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted.

                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.

                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task.

                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.

                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                  Returnspd.DataFrame Scores of the models.

                                                                                                                                                                                  method export_pipeline(model=None)[source]Export the internal pipeline.

                                                                                                                                                                                  This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

                                                                                                                                                                                  Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported.

                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                  method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.

                                                                                                                                                                                  Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.

                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

                                                                                                                                                                                  Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks.

                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                  method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.

                                                                                                                                                                                  Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.

                                                                                                                                                                                  Parametersother: Runner Instance with which to merge. Should be of the same class as self.

                                                                                                                                                                                  suffix: str, default=\"2\" Branches and models with conflicting names are merged adding suffix to the end of their names.

                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                  method run(*arrays)[source]Train and evaluate the models.

                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                  Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are:

                                                                                                                                                                                  • train, test
                                                                                                                                                                                  • X_train, X_test, y_train, y_test
                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test)

                                                                                                                                                                                  method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                  save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance.

                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                  method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                          name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: Stack.

                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the final_estimator parameter.

                                                                                                                                                                                  method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                          name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: Vote.

                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's voting instance.

                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingforecaster/", "title": "SuccessiveHalvingForecaster", "text": "

                                                                                                                                                                                  class atom.training.SuccessiveHalvingForecaster(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a successive halving fashion.

                                                                                                                                                                                  The following steps are applied to every model (per iteration):

                                                                                                                                                                                  1. Apply hyperparameter tuning (optional).
                                                                                                                                                                                  2. Fit the model on the training set using the best combination of hyperparameters found.
                                                                                                                                                                                  3. Evaluate the model on the test set.
                                                                                                                                                                                  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

                                                                                                                                                                                  Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used.

                                                                                                                                                                                  metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature function(y_true, y_pred, **kwargs) -> score, a scorer object or a sequence of these. If None, the default metric mean_absolute_percentage_error is selected.

                                                                                                                                                                                  skip_runs: int, default=0 Skip last skip_runs runs of the successive halving.

                                                                                                                                                                                  n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                  est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add _fit to the parameter's name to pass it to the estimator's fit method instead of the constructor.

                                                                                                                                                                                  ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include:

                                                                                                                                                                                  • cv: int, cv-generator, dict or sequence, default=1 Cross-validation object or number of splits. If 1, the data is randomly split in a subtrain and validation set.
                                                                                                                                                                                  • plot: bool, dict or sequence, default=False Whether to plot the optimization's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. See the plot_trials method.
                                                                                                                                                                                  • distributions: dict, sequence or None, default=None Custom hyperparameter distributions. If None, it uses the model's predefined distributions. Read more in the user guide.
                                                                                                                                                                                  • tags: dict, sequence or None, default=None Custom tags for the model's trial and mlflow run.
                                                                                                                                                                                          • **kwargs Additional keyword arguments for the constructor of the study class or the optimize method.

                                                                                                                                                                                  n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                  parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using parallel=True turns off the verbosity of the models during training. Note that many models also have build-in parallelizations (often when the estimator has the n_jobs parameter).

                                                                                                                                                                                  errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from:

                                                                                                                                                                                  • \"raise\": Raise any encountered exception.
                                                                                                                                                                                  • \"skip\": Skip a failed model. This model is not accessible after training.
                                                                                                                                                                                  • \"keep\": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter optimization after failure without losing previous successful trials.

                                                                                                                                                                                  n_jobs: int, default=1 Number of cores to use for parallel processing.

                                                                                                                                                                                  • If >0: Number of cores to use.
                                                                                                                                                                                  • If -1: Use all available cores.
                                                                                                                                                                                          • If <-1: Use number of cores + 1 + n_jobs (e.g., -2 uses all cores but one).

                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                    • \"sklearnex\"
                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                  backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

                                                                                                                                                                                  • \"loky\": Single-node, process-based parallelism.
                                                                                                                                                                                  • \"multiprocessing\": Legacy single-node, process-based parallelism. Less robust than loky.
                                                                                                                                                                                  • \"threading\": Single-node, thread-based parallelism.
                                                                                                                                                                                  • \"ray\": Multi-node, process-based parallelism.

                                                                                                                                                                                  memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide.

                                                                                                                                                                                  • If False: No caching is performed.
                                                                                                                                                                                  • If True: A default temp directory is used.
                                                                                                                                                                                  • If str: Path to the caching directory.
                                                                                                                                                                                  • If Path: A pathlib.Path to the caching directory.
                                                                                                                                                                                  • If Memory: Object with the joblib.Memory interface.

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                  warnings: bool or str, default=False

                                                                                                                                                                                  • If True: Default warning action (equal to \"once\").
                                                                                                                                                                                  • If False: Suppress all warnings (equal to \"ignore\").
                                                                                                                                                                                  • If str: One of python's warnings filters.

                                                                                                                                                                                  Changing this parameter affects the PYTHONWarnings environment. ATOM can't manage warnings that go from C/C++ code to stdout.

                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic name.
                                                                                                                                                                                  • If Path: A pathlib.Path to the log file.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed.

                                                                                                                                                                                  random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  ATOMForecaster Main class for forecasting tasks.

                                                                                                                                                                                  DirectForecaster Train and evaluate the models in a direct fashion.

                                                                                                                                                                                  TrainSizingForecaster Train and evaluate the models in a train sizing fashion.

                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingforecaster/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom.training import SuccessiveHalvingForecaster\n>>> from sktime.datasets import load_airline\n>>> from sktime.split import temporal_train_test_split\n\n>>> y = load_airline()\n\n>>> train, test = temporal_train_test_split(y, test_size=0.2)\n\n>>> runner = SuccessiveHalvingForecaster([\"ETS\", \"ES\"], verbose=2)\n>>> runner.run(train, test)\n\n\nTraining ========================= >>\nMetric: mape\n\n\nRun: 0 =========================== >>\nModels: ETS2, ES2\nSize of training set: 115 (50%)\nSize of test set: 29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0879\nTest evaluation --> mape: -0.202\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0879\nTest evaluation --> mape: -0.202\nTime elapsed: 0.017s\n-------------------------------------------------\nTime: 0.017s\n\n\nFinal results ==================== >>\nTotal time: 0.039s\n-------------------------------------\nETS                  --> mape: -0.202 !\nExponentialSmoothing --> mape: -0.202 !\n\n\nRun: 1 =========================== >>\nModels: ETS1\nSize of training set: 115 (100%)\nSize of test set: 29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0863\nTest evaluation --> mape: -0.202\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nFinal results ==================== >>\nTotal time: 0.021s\n-------------------------------------\nETS --> mape: -0.202\n\n\n>>> # Analyze the results\n>>> print(runner.results)\n\n            mape_train  mape_test  time_fit      time\nfrac model                                        
   \n0.5  ES2       -0.0879     -0.202  0.017015  0.017015\n     ETS2      -0.0879     -0.202  0.020018  0.020018\n1.0  ETS1      -0.0863     -0.202  0.020018  0.020018\n\n\n>>> print(runner.evaluate())\n\n          mae   mape        mse      r2     rmse\nETS2 -81.4454 -0.202 -8673.3633 -0.4208 -93.1309\nES2  -81.4483 -0.202 -8673.9309 -0.4209 -93.1339\nETS1 -81.4454 -0.202 -8673.3633 -0.4208 -93.1309\n
                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingforecaster/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/successivehalvingforecaster/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                  The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.

                                                                                                                                                                                  Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set.

                                                                                                                                                                                  This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingforecaster/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                  The utility attributes are used to access information about the models in the instance after training.

                                                                                                                                                                                  Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance.

                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. winner: model | NoneBest performing model.

                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. results: pd.DataFrameOverview of the training results.

                                                                                                                                                                                  All durations are in seconds. Possible values include:

                                                                                                                                                                                  • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                  • [metric]_train: Metric score on the train set.
                                                                                                                                                                                  • [metric]_test: Metric score on the test set.
                                                                                                                                                                                  • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                  • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                  • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                  • time: Total duration of the run.

                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingforecaster/#tracking-attributes", "title": "Tracking attributes", "text": "

                                                                                                                                                                                  The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.

                                                                                                                                                                                  Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline.

                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingforecaster/#plot-attributes", "title": "Plot attributes", "text": "

                                                                                                                                                                                  The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

                                                                                                                                                                                  Attributespalette: str | Sequence[str]Color palette.

                                                                                                                                                                                  Specify one of plotly's built-in palettes or create a custom one, e.g., atom.palette = [\"red\", \"green\", \"blue\"]. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers.

                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingforecaster/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.

                                                                                                                                                                                  available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.

                                                                                                                                                                                  method available_models()[source]Give an overview of the available predefined models.

                                                                                                                                                                                  Returnspd.DataFrame Information about the available predefined models. Columns include:

                                                                                                                                                                                  • acronym: Model's acronym (used to call the model).
                                                                                                                                                                                  • model: Name of the model's class.
                                                                                                                                                                                  • estimator: The model's underlying estimator.
                                                                                                                                                                                  • module: The estimator's module.
                                                                                                                                                                                  • needs_scaling: Whether the model requires feature scaling.
                                                                                                                                                                                  • accepts_sparse: Whether the model accepts sparse matrices.
                                                                                                                                                                                  • native_multilabel: Whether the model has native support for multilabel tasks.
                                                                                                                                                                                  • native_multioutput: Whether the model has native support for multioutput tasks.
                                                                                                                                                                                  • has_validation: Whether the model has in-training validation.
                                                                                                                                                                                  • supports_engines: Engines supported by the model.

                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from all models.

                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                  • Shap values
                                                                                                                                                                                  • App instance
                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                  method delete(models=None)[source]Delete models.

                                                                                                                                                                                  If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted.

                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.

                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task.

                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.

                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                  Returnspd.DataFrame Scores of the models.

                                                                                                                                                                                  method export_pipeline(model=None)[source]Export the internal pipeline.

                                                                                                                                                                                  This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

                                                                                                                                                                                  Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported.

                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                  method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.

                                                                                                                                                                                  Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.

                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

                                                                                                                                                                                  Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks.

                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                  method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.

                                                                                                                                                                                  Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.

                                                                                                                                                                                  Parametersother: Runner Instance with which to merge. Should be of the same class as self.

                                                                                                                                                                                  suffix: str, default=\"2\" Branches and models with conflicting names are merged adding suffix to the end of their names.

                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                  method run(*arrays)[source]Train and evaluate the models.

                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                  Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are:

                                                                                                                                                                                  • train, test
                                                                                                                                                                                  • X_train, X_test, y_train, y_test
                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test)

                                                                                                                                                                                  method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                  save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance.

                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                  method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                  name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: Stack.

                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the final_estimator parameter.

                                                                                                                                                                                  method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                  name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: Vote.

                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's voting instance.

                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingregressor/", "title": "SuccessiveHalvingRegressor", "text": "

                                                                                                                                                                                  class atom.training.SuccessiveHalvingRegressor(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a successive halving fashion.

                                                                                                                                                                                  The following steps are applied to every model (per iteration):

                                                                                                                                                                                  1. Apply hyperparameter tuning (optional).
                                                                                                                                                                                  2. Fit the model on the training set using the best combination of hyperparameters found.
                                                                                                                                                                                  3. Evaluate the model on the test set.
                                                                                                                                                                                  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

                                                                                                                                                                                  Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used.

                                                                                                                                                                                  metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature function(y_true, y_pred, **kwargs) -> score, a scorer object or a sequence of these. If None, the default metric r2 is selected.

                                                                                                                                                                                  skip_runs: int, default=0 Skip last skip_runs runs of the successive halving.

                                                                                                                                                                                  n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                  est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add _fit to the parameter's name to pass it to the estimator's fit method instead of the constructor.

                                                                                                                                                                                  ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include:

                                                                                                                                                                                  • cv: int, cv-generator, dict or sequence, default=1 Cross-validation object or number of splits. If 1, the data is randomly split in a subtrain and validation set.
                                                                                                                                                                                  • plot: bool, dict or sequence, default=False Whether to plot the optimization's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. See the plot_trials method.
                                                                                                                                                                                  • distributions: dict, sequence or None, default=None Custom hyperparameter distributions. If None, it uses the model's predefined distributions. Read more in the user guide.
                                                                                                                                                                                  • tags: dict, sequence or None, default=None Custom tags for the model's trial and mlflow run.
                                                                                                                                                                                  • **kwargs Additional keyword arguments for the constructor of the study class or the optimize method.

                                                                                                                                                                                  n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                  parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using parallel=True turns off the verbosity of the models during training. Note that many models also have built-in parallelization (often when the estimator has the n_jobs parameter).

                                                                                                                                                                                  errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from:

                                                                                                                                                                                  • \"raise\": Raise any encountered exception.
                                                                                                                                                                                  • \"skip\": Skip a failed model. This model is not accessible after training.
                                                                                                                                                                                  • \"keep\": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter optimization after failure without losing previous successful trials.

                                                                                                                                                                                  n_jobs: int, default=1 Number of cores to use for parallel processing.

                                                                                                                                                                                  • If >0: Number of cores to use.
                                                                                                                                                                                  • If -1: Use all available cores.
                                                                                                                                                                                  • If <-1: Use number of cores - 1 + n_jobs.

                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                    • \"sklearnex\"
                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                  backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

                                                                                                                                                                                  • \"loky\": Single-node, process-based parallelism.
                                                                                                                                                                                  • \"multiprocessing\": Legacy single-node, process-based parallelism. Less robust than loky.
                                                                                                                                                                                  • \"threading\": Single-node, thread-based parallelism.
                                                                                                                                                                                  • \"ray\": Multi-node, process-based parallelism.

                                                                                                                                                                                  memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide.

                                                                                                                                                                                  • If False: No caching is performed.
                                                                                                                                                                                  • If True: A default temp directory is used.
                                                                                                                                                                                  • If str: Path to the caching directory.
                                                                                                                                                                                  • If Path: A pathlib.Path to the caching directory.
                                                                                                                                                                                  • If Memory: Object with the joblib.Memory interface.

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                  warnings: bool or str, default=False

                                                                                                                                                                                  • If True: Default warning action (equal to \"once\").
                                                                                                                                                                                  • If False: Suppress all warnings (equal to \"ignore\").
                                                                                                                                                                                  • If str: One of python's warnings filters.

                                                                                                                                                                                  Changing this parameter affects the PYTHONWarnings environment. ATOM can't manage warnings that go from C/C++ code to stdout.

                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic name.
                                                                                                                                                                                  • If Path: A pathlib.Path to the log file.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed.

                                                                                                                                                                                  random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  ATOMRegressor Main class for regression tasks.

                                                                                                                                                                                  DirectRegressor Train and evaluate the models in a direct fashion.

                                                                                                                                                                                  TrainSizingRegressor Train and evaluate the models in a train sizing fashion.

                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingregressor/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom.training import SuccessiveHalvingRegressor\n>>> from sklearn.datasets import load_digits\n>>> from sklearn.model_selection import train_test_split\n\n>>> X, y = load_digits(return_X_y=True, as_frame=True)\n\n>>> train, test = train_test_split(\n...     X.merge(y.to_frame(), left_index=True, right_index=True),\n...     test_size=0.3,\n... )\n\n>>> runner = SuccessiveHalvingRegressor([\"OLS\", \"RF\"], verbose=2)\n>>> runner.run(train, test)\n\n\nTraining ========================= >>\nMetric: r2\n\n\nRun: 0 =========================== >>\nModels: OLS2, RF2\nSize of training set: 1257 (50%)\nSize of test set: 540\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.6083\nTest evaluation --> r2: -2.168057727555873e+23\nTime elapsed: 0.146s\n-------------------------------------------------\nTime: 0.146s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.9685\nTest evaluation --> r2: 0.7924\nTime elapsed: 0.913s\n-------------------------------------------------\nTime: 0.913s\n\n\nFinal results ==================== >>\nTotal time: 1.061s\n-------------------------------------\nOrdinaryLeastSquares --> r2: -2.168057727555873e+23 ~\nRandomForest         --> r2: 0.7924 !\n\n\nRun: 1 =========================== >>\nModels: RF1\nSize of training set: 1257 (100%)\nSize of test set: 540\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.9802\nTest evaluation --> r2: 0.8692\nTime elapsed: 1.571s\n-------------------------------------------------\nTime: 1.571s\n\n\nFinal results ==================== >>\nTotal time: 1.573s\n-------------------------------------\nRandomForest --> r2: 0.8692\n\n\n>>> # Analyze the 
results\n>>> print(runner.results)\n\n            r2_train       r2_test  time_fit      time\nfrac model                                            \n0.5  OLS2     0.6083 -2.168058e+23  0.146151  0.146151\n     RF2      0.9685  7.924000e-01  0.912829  0.912829\n1.0  RF1      0.9802  8.692000e-01  1.571428  1.571428\n\n\n>>> print(runner.evaluate())\n\n               mae          mape           mse            r2          rmse\nOLS2 -1.375810e+11 -6.979478e+14 -1.715067e+24 -2.168058e+23 -1.309606e+12\nRF2  -8.656000e-01 -3.503634e+14 -1.642300e+00  7.924000e-01 -1.281500e+00\nRF1  -6.385000e-01 -1.768080e+14 -1.034400e+00  8.692000e-01 -1.017000e+00\n
                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingregressor/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/successivehalvingregressor/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                  The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.

                                                                                                                                                                                  Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set.

                                                                                                                                                                                  This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingregressor/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                  The utility attributes are used to access information about the models in the instance after training.

                                                                                                                                                                                  Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance.

                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. winner: model | NoneBest performing model.

                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. results: pd.DataFrameOverview of the training results.

                                                                                                                                                                                  All durations are in seconds. Possible values include:

                                                                                                                                                                                  • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                  • [metric]_train: Metric score on the train set.
                                                                                                                                                                                  • [metric]_test: Metric score on the test set.
                                                                                                                                                                                  • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                  • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                  • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                  • time: Total duration of the run.

                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingregressor/#tracking-attributes", "title": "Tracking attributes", "text": "

                                                                                                                                                                                  The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.

                                                                                                                                                                                  Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline.

                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingregressor/#plot-attributes", "title": "Plot attributes", "text": "

                                                                                                                                                                                  The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

                                                                                                                                                                                  Attributespalette: str | Sequence[str]Color palette.

                                                                                                                                                                                  Specify one of plotly's built-in palettes or create a custom one, e.g., atom.palette = [\"red\", \"green\", \"blue\"]. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers.

                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingregressor/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.

                                                                                                                                                                                  available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.

                                                                                                                                                                                  method available_models()[source]Give an overview of the available predefined models.

                                                                                                                                                                                  Returnspd.DataFrame Information about the available predefined models. Columns include:

                                                                                                                                                                                  • acronym: Model's acronym (used to call the model).
                                                                                                                                                                                  • model: Name of the model's class.
                                                                                                                                                                                  • estimator: The model's underlying estimator.
                                                                                                                                                                                  • module: The estimator's module.
                                                                                                                                                                                  • needs_scaling: Whether the model requires feature scaling.
                                                                                                                                                                                  • accepts_sparse: Whether the model accepts sparse matrices.
                                                                                                                                                                                  • native_multilabel: Whether the model has native support for multilabel tasks.
                                                                                                                                                                                  • native_multioutput: Whether the model has native support for multioutput tasks.
                                                                                                                                                                                  • has_validation: Whether the model has in-training validation.
                                                                                                                                                                                  • supports_engines: Engines supported by the model.

                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from all models.

                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                  • Shap values
                                                                                                                                                                                  • App instance
                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                  method delete(models=None)[source]Delete models.

                                                                                                                                                                                  If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted.

                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.

                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task.

                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.

                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                  Returnspd.DataFrame Scores of the models.

                                                                                                                                                                                  method export_pipeline(model=None)[source]Export the internal pipeline.

                                                                                                                                                                                  This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

                                                                                                                                                                                  Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported.

                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                  method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.

                                                                                                                                                                                  Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.

                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

                                                                                                                                                                                  Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks.

                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                  method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.

                                                                                                                                                                                  Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.

                                                                                                                                                                                  Parametersother: Runner Instance with which to merge. Should be of the same class as self.

                                                                                                                                                                                  suffix: str, default=\"2\" Branches and models with conflicting names are merged adding suffix to the end of their names.

                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                  method run(*arrays)[source]Train and evaluate the models.

                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                  Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are:

                                                                                                                                                                                  • train, test
                                                                                                                                                                                  • X_train, X_test, y_train, y_test
                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test)

                                                                                                                                                                                  method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                  save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance.

                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                  method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                  name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: Stack.

                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the final_estimator parameter.

                                                                                                                                                                                  method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                  name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: Vote.

                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's voting instance.

                                                                                                                                                                                  "}, {"location": "API/training/trainsizingclassifier/", "title": "TrainSizingClassifier", "text": "

                                                                                                                                                                                  class atom.training.TrainSizingClassifier(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a train sizing fashion.

                                                                                                                                                                                  The following steps are applied to every model (per iteration):

                                                                                                                                                                                  1. Apply hyperparameter tuning (optional).
                                                                                                                                                                                  2. Fit the model on the training set using the best combination of hyperparameters found.
                                                                                                                                                                                  3. Evaluate the model on the test set.
                                                                                                                                                                                  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

                                                                                                                                                                                  Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used.

                                                                                                                                                                                  metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature function(y_true, y_pred, **kwargs) -> score, a scorer object or a sequence of these. If None, a default metric is selected for every task:

                                                                                                                                                                                  • \"f1\" for binary classification
                                                                                                                                                                                  • \"f1_weighted\" for multiclass(-multioutput) classification
                                                                                                                                                                                  • \"average_precision\" for multilabel classification

                                                                                                                                                                                  train_sizes: int or sequence, default=5 Training set sizes used to run the trainings.

                                                                                                                                                                                  • If int: Number of equally distributed splits, i.e., for a value N, it's equal to np.linspace(1.0/N, 1.0, N).
                                                                                                                                                                                  • If sequence: Fraction of the training set when <=1, else total number of samples.

                                                                                                                                                                                  n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                  est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add _fit to the parameter's name to pass it to the estimator's fit method instead of the constructor.

                                                                                                                                                                                  ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include:

                                                                                                                                                                                  • cv: int, cv-generator, dict or sequence, default=1 Cross-validation object or number of splits. If 1, the data is randomly split in a subtrain and validation set.
                                                                                                                                                                                  • plot: bool, dict or sequence, default=False Whether to plot the optimization's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. See the plot_trials method.
                                                                                                                                                                                  • distributions: dict, sequence or None, default=None Custom hyperparameter distributions. If None, it uses the model's predefined distributions. Read more in the user guide.
                                                                                                                                                                                  • tags: dict, sequence or None, default=None Custom tags for the model's trial and mlflow run.
                                                                                                                                                                                  • **kwargs Additional keyword arguments for the constructor of the study class or the optimize method.

                                                                                                                                                                                  n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                  parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using parallel=True turns off the verbosity of the models during training. Note that many models also have built-in parallelization (often when the estimator has the n_jobs parameter).

                                                                                                                                                                                  errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from:

                                                                                                                                                                                  • \"raise\": Raise any encountered exception.
                                                                                                                                                                                  • \"skip\": Skip a failed model. This model is not accessible after training.
                                                                                                                                                                                  • \"keep\": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter optimization after failure without losing previous successful trials.

                                                                                                                                                                                  n_jobs: int, default=1 Number of cores to use for parallel processing.

                                                                                                                                                                                  • If >0: Number of cores to use.
                                                                                                                                                                                  • If -1: Use all available cores.
                                                                                                                                                                                  • If <-1: Use number of cores + 1 + n_jobs, e.g., n_jobs=-2 uses all cores but one.

                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                    • \"sklearnex\"
                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                  backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

                                                                                                                                                                                  • \"loky\": Single-node, process-based parallelism.
                                                                                                                                                                                  • \"multiprocessing\": Legacy single-node, process-based parallelism. Less robust than loky.
                                                                                                                                                                                  • \"threading\": Single-node, thread-based parallelism.
                                                                                                                                                                                  • \"ray\": Multi-node, process-based parallelism.

                                                                                                                                                                                  memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide.

                                                                                                                                                                                  • If False: No caching is performed.
                                                                                                                                                                                  • If True: A default temp directory is used.
                                                                                                                                                                                  • If str: Path to the caching directory.
                                                                                                                                                                                  • If Path: A pathlib.Path to the caching directory.
                                                                                                                                                                                  • If Memory: Object with the joblib.Memory interface.

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                  warnings: bool or str, default=False

                                                                                                                                                                                  • If True: Default warning action (equal to \"once\").
                                                                                                                                                                                  • If False: Suppress all warnings (equal to \"ignore\").
                                                                                                                                                                                  • If str: One of Python's warnings filters.

                                                                                                                                                                                  Changing this parameter affects the PYTHONWarnings environment. ATOM can't manage warnings that go from C/C++ code to stdout.

                                                                                                                                                                                  logger: str, Path, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic name.
                                                                                                                                                                                  • If Path: A pathlib.Path to the log file.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed.

                                                                                                                                                                                  random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  ATOMRegressor Main class for regression tasks.

                                                                                                                                                                                  DirectRegressor Train and evaluate the models in a direct fashion.

                                                                                                                                                                                  SuccessiveHalvingRegressor Train and evaluate the models in a successive halving fashion.

                                                                                                                                                                                  "}, {"location": "API/training/trainsizingclassifier/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom.training import TrainSizingClassifier\n>>> from sklearn.datasets import load_breast_cancer\n>>> from sklearn.model_selection import train_test_split\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> train, test = train_test_split(\n...     X.merge(y.to_frame(), left_index=True, right_index=True),\n...     test_size=0.3,\n... )\n\n>>> runner = TrainSizingClassifier(models=\"LR\", verbose=2)\n>>> runner.run(train, test)\n\n\nTraining ========================= >>\nMetric: f1\n\n\nRun: 0 =========================== >>\nModels: LR02\nSize of training set: 79 (20%)\nSize of test set: 171\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9899\nTest evaluation --> f1: 0.9455\nTime elapsed: 0.086s\n-------------------------------------------------\nTime: 0.086s\n\n\nFinal results ==================== >>\nTotal time: 0.089s\n-------------------------------------\nLogisticRegression --> f1: 0.9455\n\n\nRun: 1 =========================== >>\nModels: LR04\nSize of training set: 159 (40%)\nSize of test set: 171\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9898\nTest evaluation --> f1: 0.9727\nTime elapsed: 0.086s\n-------------------------------------------------\nTime: 0.086s\n\n\nFinal results ==================== >>\nTotal time: 0.088s\n-------------------------------------\nLogisticRegression --> f1: 0.9727\n\n\nRun: 2 =========================== >>\nModels: LR06\nSize of training set: 238 (60%)\nSize of test set: 171\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9936\nTest evaluation --> f1: 0.9683\nTime elapsed: 
0.085s\n-------------------------------------------------\nTime: 0.085s\n\n\nFinal results ==================== >>\nTotal time: 0.088s\n-------------------------------------\nLogisticRegression --> f1: 0.9683\n\n\nRun: 3 =========================== >>\nModels: LR08\nSize of training set: 318 (80%)\nSize of test set: 171\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9901\nTest evaluation --> f1: 0.9817\nTime elapsed: 0.096s\n-------------------------------------------------\nTime: 0.096s\n\n\nFinal results ==================== >>\nTotal time: 0.099s\n-------------------------------------\nLogisticRegression --> f1: 0.9817\n\n\nRun: 4 =========================== >>\nModels: LR10\nSize of training set: 398 (100%)\nSize of test set: 171\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.992\nTest evaluation --> f1: 0.9772\nTime elapsed: 0.099s\n-------------------------------------------------\nTime: 0.099s\n\n\nFinal results ==================== >>\nTotal time: 0.102s\n-------------------------------------\nLogisticRegression --> f1: 0.9772\n\n\n>>> # Analyze the results\n>>> print(runner.results)\n\n            f1_train  f1_test  time_fit      time\nfrac model                                       \n0.2  LR02     0.9899   0.9455  0.086078  0.086078\n0.4  LR04     0.9898   0.9727  0.086078  0.086078\n0.6  LR06     0.9936   0.9683  0.085077  0.085077\n0.8  LR08     0.9901   0.9817  0.095865  0.095865\n1.0  LR10     0.9920   0.9772  0.098852  0.098852\n\n\n>>> print(runner.evaluate())\n\n      accuracy      ap      ba      f1  jaccard     mcc  precision  recall     auc\nLR02    0.9298  0.9916  0.9180  0.9455   0.8966  0.8483     0.9286  0.9630  0.9857\nLR04    0.9649  0.9971  0.9557  0.9727   0.9469  0.9248     0.9554  0.9907  0.9950\nLR06    0.9591  0.9976  0.9478  0.9683   0.9386  0.9124     0.9469  0.9907  0.9959\nLR08    0.9766  
0.9963  0.9716  0.9817   0.9640  0.9497     0.9727  0.9907  0.9938\nLR10    0.9708  0.9973  0.9636  0.9772   0.9554  0.9372     0.9640  0.9907  0.9954\n
                                                                                                                                                                                  "}, {"location": "API/training/trainsizingclassifier/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/trainsizingclassifier/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                  The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.

                                                                                                                                                                                  Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set.

                                                                                                                                                                                  This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                  "}, {"location": "API/training/trainsizingclassifier/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                  The utility attributes are used to access information about the models in the instance after training.

                                                                                                                                                                                  Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance.

                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. winner: model | NoneBest performing model.

                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. results: pd.DataFrameOverview of the training results.

                                                                                                                                                                                  All durations are in seconds. Possible values include:

                                                                                                                                                                                  • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                  • [metric]_train: Metric score on the train set.
                                                                                                                                                                                  • [metric]_test: Metric score on the test set.
                                                                                                                                                                                  • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                  • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                  • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                  • time: Total duration of the run.

                                                                                                                                                                                  "}, {"location": "API/training/trainsizingclassifier/#tracking-attributes", "title": "Tracking attributes", "text": "

                                                                                                                                                                                  The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.

                                                                                                                                                                                  Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline.

                                                                                                                                                                                  "}, {"location": "API/training/trainsizingclassifier/#plot-attributes", "title": "Plot attributes", "text": "

                                                                                                                                                                                  The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

                                                                                                                                                                                  Attributespalette: str | Sequence[str]Color palette.

                                                                                                                                                                                  Specify one of plotly's built-in palettes or create a custom one, e.g., atom.palette = [\"red\", \"green\", \"blue\"]. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers.

                                                                                                                                                                                  "}, {"location": "API/training/trainsizingclassifier/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.

                                                                                                                                                                                  available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.

                                                                                                                                                                                  method available_models()[source]Give an overview of the available predefined models.

                                                                                                                                                                                  Returnspd.DataFrame Information about the available predefined models. Columns include:

                                                                                                                                                                                  • acronym: Model's acronym (used to call the model).
                                                                                                                                                                                  • model: Name of the model's class.
                                                                                                                                                                                  • estimator: The model's underlying estimator.
                                                                                                                                                                                  • module: The estimator's module.
                                                                                                                                                                                  • needs_scaling: Whether the model requires feature scaling.
                                                                                                                                                                                  • accepts_sparse: Whether the model accepts sparse matrices.
                                                                                                                                                                                  • native_multilabel: Whether the model has native support for multilabel tasks.
                                                                                                                                                                                  • native_multioutput: Whether the model has native support for multioutput tasks.
                                                                                                                                                                                  • has_validation: Whether the model has in-training validation.
                                                                                                                                                                                  • supports_engines: Engines supported by the model.

                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc.). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from all models.

                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                  • Shap values
                                                                                                                                                                                  • App instance
                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                  method delete(models=None)[source]Delete models.

                                                                                                                                                                                  If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted.

                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.

                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task.

                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.

                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                  Returnspd.DataFrame Scores of the models.

                                                                                                                                                                                  method export_pipeline(model=None)[source]Export the internal pipeline.

                                                                                                                                                                                  This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

                                                                                                                                                                                  Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported.

                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                  method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.

                                                                                                                                                                                  Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.

                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

                                                                                                                                                                                  Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks.

                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                  method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.

                                                                                                                                                                                  Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.

                                                                                                                                                                                  Parametersother: Runner Instance with which to merge. Should be of the same class as self.

                                                                                                                                                                                  suffix: str, default=\"2\" Branches and models with conflicting names are merged adding suffix to the end of their names.

                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                  method run(*arrays)[source]Train and evaluate the models.

                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                  Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are:

                                                                                                                                                                                  • train, test
                                                                                                                                                                                  • X_train, X_test, y_train, y_test
                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test)

                                                                                                                                                                                  method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                  save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance.

                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                  method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                  name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: Stack.

                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the final_estimator parameter.

                                                                                                                                                                                  method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                  name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: Vote.

                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's voting instance.

                                                                                                                                                                                  "}, {"location": "API/training/trainsizingforecaster/", "title": "TrainSizingForecaster", "text": "

                                                                                                                                                                                  class atom.training.TrainSizingForecaster(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a train sizing fashion.

                                                                                                                                                                                  The following steps are applied to every model (per iteration):

                                                                                                                                                                                  1. Apply hyperparameter tuning (optional).
                                                                                                                                                                                  2. Fit the model on the training set using the best combination of hyperparameters found.
                                                                                                                                                                                  3. Evaluate the model on the test set.
                                                                                                                                                                                  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

                                                                                                                                                                                  Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used.

                                                                                                                                                                                  metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature function(y_true, y_pred, **kwargs) -> score, a scorer object or a sequence of these. If None, the default metric mean_absolute_percentage_error is selected.

                                                                                                                                                                                  train_sizes: int or sequence, default=5 Training set sizes used to run the trainings.

                                                                                                                                                                                  • If int: Number of equally distributed splits, i.e., for a value N, it's equal to np.linspace(1.0/N, 1.0, N).
                                                                                                                                                                                  • If sequence: Fraction of the training set when <=1, else total number of samples.

                                                                                                                                                                                  n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                  est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add _fit to the parameter's name to pass it to the estimator's fit method instead of the constructor.

                                                                                                                                                                                  ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include:

                                                                                                                                                                                  • cv: int, cv-generator, dict or sequence, default=1 Cross-validation object or number of splits. If 1, the data is randomly split in a subtrain and validation set.
                                                                                                                                                                                  • plot: bool, dict or sequence, default=False Whether to plot the optimization's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. See the plot_trials method.
                                                                                                                                                                                  • distributions: dict, sequence or None, default=None Custom hyperparameter distributions. If None, it uses the model's predefined distributions. Read more in the user guide.
                                                                                                                                                                                  • tags: dict, sequence or None, default=None Custom tags for the model's trial and mlflow run.
                                                                                                                                                                                  • **kwargs Additional keyword arguments for the constructor of the study class or the optimize method.

                                                                                                                                                                                  n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                  parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using parallel=True turns off the verbosity of the models during training. Note that many models also have built-in parallelization (often when the estimator has the n_jobs parameter).

                                                                                                                                                                                  errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from:

                                                                                                                                                                                  • \"raise\": Raise any encountered exception.
                                                                                                                                                                                  • \"skip\": Skip a failed model. This model is not accessible after training.
                                                                                                                                                                                  • \"keep\": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter optimization after failure without losing previous successful trials.

                                                                                                                                                                                  n_jobs: int, default=1 Number of cores to use for parallel processing.

                                                                                                                                                                                  • If >0: Number of cores to use.
                                                                                                                                                                                  • If -1: Use all available cores.
                                                                                                                                                                                  • If <-1: Use number of cores + 1 + n_jobs, e.g., n_jobs=-2 uses all cores but one.

                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                    • \"sklearnex\"
                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                  backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

                                                                                                                                                                                  • \"loky\": Single-node, process-based parallelism.
                                                                                                                                                                                  • \"multiprocessing\": Legacy single-node, process-based parallelism. Less robust than loky.
                                                                                                                                                                                  • \"threading\": Single-node, thread-based parallelism.
                                                                                                                                                                                  • \"ray\": Multi-node, process-based parallelism.

                                                                                                                                                                                  memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide.

                                                                                                                                                                                  • If False: No caching is performed.
                                                                                                                                                                                  • If True: A default temp directory is used.
                                                                                                                                                                                  • If str: Path to the caching directory.
                                                                                                                                                                                  • If Path: A pathlib.Path to the caching directory.
                                                                                                                                                                                  • If Memory: Object with the joblib.Memory interface.

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                  warnings: bool or str, default=False

                                                                                                                                                                                  • If True: Default warning action (equal to \"once\").
                                                                                                                                                                                  • If False: Suppress all warnings (equal to \"ignore\").
                                                                                                                                                                                  • If str: One of python's warnings filters.

                                                                                                                                                                                  Changing this parameter affects the PYTHONWarnings environment. ATOM can't manage warnings that go from C/C++ code to stdout.

                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic name.
                                                                                                                                                                                  • If Path: A pathlib.Path to the log file.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed.

                                                                                                                                                                                  random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  ATOMForecaster Main class for forecasting tasks.

                                                                                                                                                                                  DirectForecaster Train and evaluate the models in a direct fashion.

                                                                                                                                                                                  SuccessiveHalvingForecaster Train and evaluate the models in a successive halving fashion.

                                                                                                                                                                                  "}, {"location": "API/training/trainsizingforecaster/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom.training import TrainSizingForecaster\n>>> from sktime.datasets import load_airline\n>>> from sktime.split import temporal_train_test_split\n\n>>> y = load_airline()\n\n>>> train, test = temporal_train_test_split(y, test_size=0.2)\n\n>>> runner = TrainSizingForecaster([\"ETS\", \"ES\"], verbose=2)\n>>> runner.run(train, test)\n\n\nTraining ========================= >>\nMetric: mape\n\n\nRun: 0 =========================== >>\nModels: ETS02, ES02\nSize of training set: 23 (20%)\nSize of test set: 29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0889\nTest evaluation --> mape: -0.202\nTime elapsed: 0.021s\n-------------------------------------------------\nTime: 0.021s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0889\nTest evaluation --> mape: -0.202\nTime elapsed: 0.017s\n-------------------------------------------------\nTime: 0.017s\n\n\nFinal results ==================== >>\nTotal time: 0.041s\n-------------------------------------\nETS                  --> mape: -0.202 !\nExponentialSmoothing --> mape: -0.202 !\n\n\nRun: 1 =========================== >>\nModels: ETS04, ES04\nSize of training set: 46 (40%)\nSize of test set: 29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0871\nTest evaluation --> mape: -0.202\nTime elapsed: 0.019s\n-------------------------------------------------\nTime: 0.019s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0871\nTest evaluation --> mape: -0.202\nTime elapsed: 0.018s\n-------------------------------------------------\nTime: 0.018s\n\n\nFinal results ==================== 
>>\nTotal time: 0.039s\n-------------------------------------\nETS                  --> mape: -0.202 !\nExponentialSmoothing --> mape: -0.202 !\n\n\nRun: 2 =========================== >>\nModels: ETS06, ES06\nSize of training set: 69 (60%)\nSize of test set: 29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0861\nTest evaluation --> mape: -0.202\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0867\nTest evaluation --> mape: -0.2016\nTime elapsed: 0.017s\n-------------------------------------------------\nTime: 0.017s\n\n\nFinal results ==================== >>\nTotal time: 0.038s\n-------------------------------------\nETS                  --> mape: -0.202\nExponentialSmoothing --> mape: -0.2016 !\n\n\nRun: 3 =========================== >>\nModels: ETS08, ES08\nSize of training set: 92 (80%)\nSize of test set: 29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0842\nTest evaluation --> mape: -0.202\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0845\nTest evaluation --> mape: -0.202\nTime elapsed: 0.018s\n-------------------------------------------------\nTime: 0.018s\n\n\nFinal results ==================== >>\nTotal time: 0.040s\n-------------------------------------\nETS                  --> mape: -0.202 !\nExponentialSmoothing --> mape: -0.202 !\n\n\nRun: 4 =========================== >>\nModels: ETS10, ES10\nSize of training set: 115 (100%)\nSize of test set: 29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0863\nTest evaluation --> mape: -0.202\nTime elapsed: 
0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0868\nTest evaluation --> mape: -0.2018\nTime elapsed: 0.018s\n-------------------------------------------------\nTime: 0.018s\n\n\nFinal results ==================== >>\nTotal time: 0.040s\n-------------------------------------\nETS                  --> mape: -0.202\nExponentialSmoothing --> mape: -0.2018 !\n\n\n>>> # Analyze the results\n>>> print(runner.results)\n\n            mape_train  mape_test  time_fit      time\nfrac model                                           \n0.2  ES02      -0.0889    -0.2020  0.017015  0.017015\n     ETS02     -0.0889    -0.2020  0.021020  0.021020\n0.4  ES04      -0.0871    -0.2020  0.018016  0.018016\n     ETS04     -0.0871    -0.2020  0.019017  0.019017\n0.6  ES06      -0.0867    -0.2016  0.017015  0.017015\n     ETS06     -0.0861    -0.2020  0.020019  0.020019\n0.8  ES08      -0.0845    -0.2020  0.018016  0.018016\n     ETS08     -0.0842    -0.2020  0.020018  0.020018\n1.0  ES10      -0.0868    -0.2018  0.018016  0.018016\n     ETS10     -0.0863    -0.2020  0.020018  0.020018\n\n\n>>> print(runner.evaluate())\n\n           mae    mape        mse      r2     rmse\nETS02 -81.4454 -0.2020 -8673.3633 -0.4208 -93.1309\nES02  -81.4444 -0.2020 -8673.1766 -0.4208 -93.1299\nETS04 -81.4454 -0.2020 -8673.3633 -0.4208 -93.1309\nES04  -81.4483 -0.2020 -8673.9309 -0.4209 -93.1339\nETS06 -81.4454 -0.2020 -8673.3633 -0.4208 -93.1309\nES06  -81.3025 -0.2016 -8645.4416 -0.4162 -92.9809\nETS08 -81.4454 -0.2020 -8673.3633 -0.4208 -93.1309\nES08  -81.4483 -0.2020 -8673.9309 -0.4209 -93.1339\nETS10 -81.4454 -0.2020 -8673.3633 -0.4208 -93.1309\nES10  -81.3862 -0.2018 -8661.7730 -0.4189 -93.0686\n
                                                                                                                                                                                  "}, {"location": "API/training/trainsizingforecaster/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/trainsizingforecaster/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                  The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.

                                                                                                                                                                                  Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set.

                                                                                                                                                                                  This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                  "}, {"location": "API/training/trainsizingforecaster/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                  The utility attributes are used to access information about the models in the instance after training.

                                                                                                                                                                                  Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance.

                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. winner: model | NoneBest performing model.

                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. results: pd.DataFrameOverview of the training results.

                                                                                                                                                                                  All durations are in seconds. Possible values include:

                                                                                                                                                                                  • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                  • [metric]_train: Metric score on the train set.
                                                                                                                                                                                  • [metric]_test: Metric score on the test set.
                                                                                                                                                                                  • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                  • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                  • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                  • time: Total duration of the run.

                                                                                                                                                                                  "}, {"location": "API/training/trainsizingforecaster/#tracking-attributes", "title": "Tracking attributes", "text": "

                                                                                                                                                                                  The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.

                                                                                                                                                                                  Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline.

                                                                                                                                                                                  "}, {"location": "API/training/trainsizingforecaster/#plot-attributes", "title": "Plot attributes", "text": "

                                                                                                                                                                                  The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

                                                                                                                                                                                  Attributes: palette: str | Sequence[str]. Color palette.

                                                                                                                                                                                  Specify one of plotly's built-in palettes or create a custom one, e.g., atom.palette = [\"red\", \"green\", \"blue\"]. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers.

                                                                                                                                                                                  "}, {"location": "API/training/trainsizingforecaster/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.

                                                                                                                                                                                  available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.

                                                                                                                                                                                  method available_models()[source]Give an overview of the available predefined models.

                                                                                                                                                                                  Returnspd.DataFrame Information about the available predefined models. Columns include:

                                                                                                                                                                                  • acronym: Model's acronym (used to call the model).
                                                                                                                                                                                  • model: Name of the model's class.
                                                                                                                                                                                  • estimator: The model's underlying estimator.
                                                                                                                                                                                  • module: The estimator's module.
                                                                                                                                                                                  • needs_scaling: Whether the model requires feature scaling.
                                                                                                                                                                                  • accepts_sparse: Whether the model accepts sparse matrices.
                                                                                                                                                                                  • native_multilabel: Whether the model has native support for multilabel tasks.
                                                                                                                                                                                  • native_multioutput: Whether the model has native support for multioutput tasks.
                                                                                                                                                                                  • has_validation: Whether the model has in-training validation.
                                                                                                                                                                                  • supports_engines: Engines supported by the model.

                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from all models.

                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                  • Shap values
                                                                                                                                                                                  • App instance
                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                  method delete(models=None)[source]Delete models.

                                                                                                                                                                                  If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted.

                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.

                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task.

                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.

                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                  Returnspd.DataFrame Scores of the models.

                                                                                                                                                                                  method export_pipeline(model=None)[source]Export the internal pipeline.

                                                                                                                                                                                  This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

                                                                                                                                                                                  Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported.

                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                  method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.

                                                                                                                                                                                  Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.

                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

                                                                                                                                                                                  Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks.

                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                  method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.

                                                                                                                                                                                  Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.

                                                                                                                                                                                  Parametersother: Runner Instance with which to merge. Should be of the same class as self.

                                                                                                                                                                                  suffix: str, default=\"2\" Branches and models with conflicting names are merged adding suffix to the end of their names.

                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                  method run(*arrays)[source]Train and evaluate the models.

                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                  Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are:

                                                                                                                                                                                  • train, test
                                                                                                                                                                                  • X_train, X_test, y_train, y_test
                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test)

                                                                                                                                                                                  method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                  save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance.

                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                  method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                  name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: Stack.

                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the final_estimator parameter.

                                                                                                                                                                                  method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                  name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: Vote.

                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's voting instance.

                                                                                                                                                                                  "}, {"location": "API/training/trainsizingregressor/", "title": "TrainSizingRegressor", "text": "

                                                                                                                                                                                  class atom.training.TrainSizingRegressor(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a train sizing fashion.

                                                                                                                                                                                  The following steps are applied to every model (per iteration):

                                                                                                                                                                                  1. Apply hyperparameter tuning (optional).
                                                                                                                                                                                  2. Fit the model on the training set using the best combination of hyperparameters found.
                                                                                                                                                                                  3. Evaluate the model on the test set.
                                                                                                                                                                                  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

                                                                                                                                                                                  Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used.

                                                                                                                                                                                  metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature function(y_true, y_pred, **kwargs) -> score, a scorer object or a sequence of these. If None, the default metric r2 is selected.

                                                                                                                                                                                  train_sizes: int or sequence, default=5 Training set sizes used to run the trainings.

                                                                                                                                                                                  • If int: Number of equally distributed splits, i.e., for a value N, it's equal to np.linspace(1.0/N, 1.0, N).
                                                                                                                                                                                  • If sequence: Fraction of the training set when <=1, else total number of samples.

                                                                                                                                                                                  n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                  est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add _fit to the parameter's name to pass it to the estimator's fit method instead of the constructor.

                                                                                                                                                                                  ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include:

                                                                                                                                                                                  • cv: int, cv-generator, dict or sequence, default=1 Cross-validation object or number of splits. If 1, the data is randomly split in a subtrain and validation set.
                                                                                                                                                                                  • plot: bool, dict or sequence, default=False Whether to plot the optimization's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. See the plot_trials method.
                                                                                                                                                                                  • distributions: dict, sequence or None, default=None Custom hyperparameter distributions. If None, it uses the model's predefined distributions. Read more in the user guide.
                                                                                                                                                                                  • tags: dict, sequence or None, default=None Custom tags for the model's trial and mlflow run.
                                                                                                                                                                                  • **kwargs Additional keyword arguments for the constructor of the study class or the optimize method.

                                                                                                                                                                                  n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                  parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using parallel=True turns off the verbosity of the models during training. Note that many models also have built-in parallelization (often when the estimator has the n_jobs parameter).

                                                                                                                                                                                  errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from:

                                                                                                                                                                                  • \"raise\": Raise any encountered exception.
                                                                                                                                                                                  • \"skip\": Skip a failed model. This model is not accessible after training.
                                                                                                                                                                                  • \"keep\": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter optimization after failure without losing previous successful trials.

                                                                                                                                                                                  n_jobs: int, default=1 Number of cores to use for parallel processing.

                                                                                                                                                                                  • If >0: Number of cores to use.
                                                                                                                                                                                  • If -1: Use all available cores.
                                                                                                                                                                                  • If <-1: Use number of cores - 1 + n_jobs.

                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                    • \"sklearnex\"
                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                  backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

                                                                                                                                                                                  • \"loky\": Single-node, process-based parallelism.
                                                                                                                                                                                  • \"multiprocessing\": Legacy single-node, process-based parallelism. Less robust than loky.
                                                                                                                                                                                  • \"threading\": Single-node, thread-based parallelism.
                                                                                                                                                                                  • \"ray\": Multi-node, process-based parallelism.

                                                                                                                                                                                  memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide.

                                                                                                                                                                                  • If False: No caching is performed.
                                                                                                                                                                                  • If True: A default temp directory is used.
                                                                                                                                                                                  • If str: Path to the caching directory.
                                                                                                                                                                                  • If Path: A pathlib.Path to the caching directory.
                                                                                                                                                                                  • If Memory: Object with the joblib.Memory interface.

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                  warnings: bool or str, default=False

                                                                                                                                                                                  • If True: Default warning action (equal to \"once\").
                                                                                                                                                                                  • If False: Suppress all warnings (equal to \"ignore\").
                                                                                                                                                                                  • If str: One of python's warnings filters.

                                                                                                                                                                                  Changing this parameter affects the PYTHONWarnings environment. ATOM can't manage warnings that go from C/C++ code to stdout.

                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic name.
                                                                                                                                                                                  • If Path: A pathlib.Path to the log file.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed.

                                                                                                                                                                                  random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  ATOMRegressor Main class for regression tasks.

                                                                                                                                                                                  DirectRegressor Train and evaluate the models in a direct fashion.

                                                                                                                                                                                  SuccessiveHalvingRegressor Train and evaluate the models in a successive halving fashion.

                                                                                                                                                                                  "}, {"location": "API/training/trainsizingregressor/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom.training import TrainSizingRegressor\n>>> from sklearn.datasets import load_digits\n>>> from sklearn.model_selection import train_test_split\n\n>>> X, y = load_digits(return_X_y=True, as_frame=True)\n\n>>> train, test = train_test_split(\n...     X.merge(y.to_frame(), left_index=True, right_index=True),\n...     test_size=0.3,\n... )\n\n>>> runner = TrainSizingRegressor(models=\"OLS\", verbose=2)\n>>> runner.run(train, test)\n\n\nTraining ========================= >>\nMetric: r2\n\n\nRun: 0 =========================== >>\nModels: OLS02\nSize of training set: 251 (20%)\nSize of test set: 540\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.6391\nTest evaluation --> r2: -4.630208907041091e+25\nTime elapsed: 0.148s\n-------------------------------------------------\nTime: 0.148s\n\n\nFinal results ==================== >>\nTotal time: 0.149s\n-------------------------------------\nOrdinaryLeastSquares --> r2: -4.630208907041091e+25 ~\n\n\nRun: 1 =========================== >>\nModels: OLS04\nSize of training set: 502 (40%)\nSize of test set: 540\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.6137\nTest evaluation --> r2: -9.496101715653298e+22\nTime elapsed: 0.150s\n-------------------------------------------------\nTime: 0.150s\n\n\nFinal results ==================== >>\nTotal time: 0.151s\n-------------------------------------\nOrdinaryLeastSquares --> r2: -9.496101715653298e+22 ~\n\n\nRun: 2 =========================== >>\nModels: OLS06\nSize of training set: 754 (60%)\nSize of test set: 540\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.6086\nTest evaluation --> r2: 
-0.2872\nTime elapsed: 0.151s\n-------------------------------------------------\nTime: 0.151s\n\n\nFinal results ==================== >>\nTotal time: 0.152s\n-------------------------------------\nOrdinaryLeastSquares --> r2: -0.2872 ~\n\n\nRun: 3 =========================== >>\nModels: OLS08\nSize of training set: 1005 (80%)\nSize of test set: 540\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.5986\nTest evaluation --> r2: 0.5025\nTime elapsed: 0.150s\n-------------------------------------------------\nTime: 0.150s\n\n\nFinal results ==================== >>\nTotal time: 0.152s\n-------------------------------------\nOrdinaryLeastSquares --> r2: 0.5025\n\n\nRun: 4 =========================== >>\nModels: OLS10\nSize of training set: 1257 (100%)\nSize of test set: 540\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.5951\nTest evaluation --> r2: 0.5864\nTime elapsed: 0.150s\n-------------------------------------------------\nTime: 0.150s\n\n\nFinal results ==================== >>\nTotal time: 0.151s\n-------------------------------------\nOrdinaryLeastSquares --> r2: 0.5864\n\n\n>>> # Analyze the results\n>>> print(runner.results)\n\n            r2_train       r2_test  time_fit      time\nfrac model                                            \n0.2  OLS02    0.6391 -4.630209e+25  0.148360  0.148360\n0.4  OLS04    0.6137 -9.496102e+22  0.149996  0.149996\n0.6  OLS06    0.6086 -2.872000e-01  0.151353  0.151353\n0.8  OLS08    0.5986  5.025000e-01  0.149508  0.149508\n1.0  OLS10    0.5951  5.864000e-01  0.149549  0.149549\n\n\n>>> print(runner.evaluate())\n\n                mae          mape           mse            r2          rmse\nOLS02 -1.004380e+12 -7.646687e+14 -3.774343e+26 -4.630209e+25 -1.942767e+13\nOLS04 -5.120843e+10 -8.663629e+14 -7.740805e+23 -9.496102e+22 -8.798184e+11\nOLS06 -1.559600e+00 -7.836450e+14 -1.049240e+01 
-2.872000e-01 -3.239200e+00\nOLS08 -1.482200e+00 -8.382465e+14 -4.055100e+00  5.025000e-01 -2.013700e+00\nOLS10 -1.445900e+00 -8.224099e+14 -3.371700e+00  5.864000e-01 -1.836200e+00\n
                                                                                                                                                                                  "}, {"location": "API/training/trainsizingregressor/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/trainsizingregressor/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                  The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.

                                                                                                                                                                                  Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set.

                                                                                                                                                                                  This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                  "}, {"location": "API/training/trainsizingregressor/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                  The utility attributes are used to access information about the models in the instance after training.

                                                                                                                                                                                  Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance.

                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved in favor of the lowest time_fit. winner: model | NoneBest performing model.

                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved in favor of the lowest time_fit. results: pd.DataFrameOverview of the training results.

                                                                                                                                                                                  All durations are in seconds. Possible values include:

                                                                                                                                                                                  • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                  • [metric]_train: Metric score on the train set.
                                                                                                                                                                                  • [metric]_test: Metric score on the test set.
                                                                                                                                                                                  • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                  • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                  • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                  • time: Total duration of the run.

                                                                                                                                                                                  "}, {"location": "API/training/trainsizingregressor/#tracking-attributes", "title": "Tracking attributes", "text": "

                                                                                                                                                                                  The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.

                                                                                                                                                                                  Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline.

                                                                                                                                                                                  "}, {"location": "API/training/trainsizingregressor/#plot-attributes", "title": "Plot attributes", "text": "

                                                                                                                                                                                  The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

                                                                                                                                                                                  Attributespalette: str | Sequence[str]Color palette.

                                                                                                                                                                                  Specify one of plotly's built-in palettes or create a custom one, e.g., atom.palette = [\"red\", \"green\", \"blue\"]. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers.

                                                                                                                                                                                  "}, {"location": "API/training/trainsizingregressor/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.

                                                                                                                                                                                  available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.

                                                                                                                                                                                  method available_models()[source]Give an overview of the available predefined models.

                                                                                                                                                                                  Returnspd.DataFrame Information about the available predefined models. Columns include:

                                                                                                                                                                                  • acronym: Model's acronym (used to call the model).
                                                                                                                                                                                  • model: Name of the model's class.
                                                                                                                                                                                  • estimator: The model's underlying estimator.
                                                                                                                                                                                  • module: The estimator's module.
                                                                                                                                                                                  • needs_scaling: Whether the model requires feature scaling.
                                                                                                                                                                                  • accepts_sparse: Whether the model accepts sparse matrices.
                                                                                                                                                                                  • native_multilabel: Whether the model has native support for multilabel tasks.
                                                                                                                                                                                  • native_multioutput: Whether the model has native support for multioutput tasks.
                                                                                                                                                                                  • has_validation: Whether the model has in-training validation.
                                                                                                                                                                                  • supports_engines: Engines supported by the model.

                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                  Parametersrows: int, default=1 Number of rows of plots (vertical direction).

                                                                                                                                                                                  cols: int, default=2 Number of columns of plots (horizontal direction).

                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from all models.

                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                  • Shap values
                                                                                                                                                                                  • App instance
                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                  method delete(models=None)[source]Delete models.

                                                                                                                                                                                  If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted.

                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.

                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task.

                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.

                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                  Returnspd.DataFrame Scores of the models.

                                                                                                                                                                                  method export_pipeline(model=None)[source]Export the internal pipeline.

                                                                                                                                                                                  This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

                                                                                                                                                                                  Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported.

                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                  method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.

                                                                                                                                                                                  Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.

                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

                                                                                                                                                                                  Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks.

                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                  method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.

                                                                                                                                                                                  Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.

                                                                                                                                                                                  Parametersother: Runner Instance with which to merge. Should be of the same class as self.

                                                                                                                                                                                  suffix: str, default=\"2\" Branches and models with conflicting names are merged adding suffix to the end of their names.

                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                  method run(*arrays)[source]Train and evaluate the models.

                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                  Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are:

                                                                                                                                                                                  • train, test
                                                                                                                                                                                  • X_train, X_test, y_train, y_test
                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test)

                                                                                                                                                                                  method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                  save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance.

                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                  method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                  name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: Stack.

                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the final_estimator parameter.

                                                                                                                                                                                  method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                  name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: Vote.

                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's voting instance.

                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/", "title": "Release history", "text": ""}, {"location": "changelog/v4.x.x/#version-4141", "title": "Version 4.14.1", "text": "
                                                                                                                                                                                  • Fixed an installation issue with conda.
                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-4140", "title": "Version 4.14.0", "text": "
                                                                                                                                                                                  • Refactor of the Cleaner and Vectorizer classes.
                                                                                                                                                                                  • Refactor of the cross_validate method.
                                                                                                                                                                                  • The plot_pipeline method now supports drawing multiple pipelines.
                                                                                                                                                                                  • Renamed the Normalizer class to TextNormalizer.
                                                                                                                                                                                  • Renamed the Gauss class to Normalizer.
                                                                                                                                                                                  • Added the inverse_transform method to the Scaler, Normalizer and Cleaner classes.
                                                                                                                                                                                  • Added the winners property to the trainers (note the extra s).
                                                                                                                                                                                  • Added the feature_names_in_ and n_features_in_ attributes to transformers.
                                                                                                                                                                                  • The default value of the warnings parameter is set to False.
                                                                                                                                                                                  • Improvements for multicollinearity removal in FeatureSelector.
                                                                                                                                                                                  • Renamed default feature names to x0, x1, etc... for consistency with sklearn's API.
                                                                                                                                                                                  • Renamed component names in FeatureSelector to pca0, pca1, etc... for consistency with sklearn's API.
                                                                                                                                                                                  • Significant speed up in pipeline transformations.
                                                                                                                                                                                  • Fixed a bug where mlflow runs could be ended unexpectedly.
                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-4131", "title": "Version 4.13.1", "text": "
                                                                                                                                                                                  • Fixed an installation issue.
                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-4130", "title": "Version 4.13.0", "text": "
                                                                                                                                                                                  • Added GPU support. Read more in the user guide.
                                                                                                                                                                                  • Added advanced feature selection strategies.
                                                                                                                                                                                  • Added the return_sparse parameter to the Vectorizer class.
                                                                                                                                                                                  • Added the quantile hyperparameter to the Dummy model.
                                                                                                                                                                                  • The data attributes now return pandas objects where possible.
                                                                                                                                                                                  • Fixed a bug where the BO could crash after balancing the data.
                                                                                                                                                                                  • Fixed a bug where saving the FeatureGenerator class could fail for certain operators.
                                                                                                                                                                                  • Fixed a bug where the FeatureSelector class displayed the wrong output.
                                                                                                                                                                                  • Fixed a bug where the mapping attribute was not reordered.
                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-4120", "title": "Version 4.12.0", "text": "
                                                                                                                                                                                  • Support for Python 3.10.
                                                                                                                                                                                  • New Discretizer class to bin numerical features.
                                                                                                                                                                                  • Refactor of the FeatureGenerator class.
                                                                                                                                                                                  • The mapping attribute now shows all encoded features.
                                                                                                                                                                                  • Added the sample_weight parameter to the evaluate method.
                                                                                                                                                                                  • ATOMClassifier now has a stratify parameter to split the data sets in a stratified fashion.
                                                                                                                                                                                  • Possibility to exclude hyperparameters from the BO by adding ! before the name.
                                                                                                                                                                                  • Added memory usage to the stats method.
                                                                                                                                                                                  • Fixed a bug where plot_shap_decision could fail when only one row was plotted.
                                                                                                                                                                                  • Added versioning to the documentation.
                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-4110", "title": "Version 4.11.0", "text": "
                                                                                                                                                                                  • Full support for sparse matrices. Read more in the user guide.
                                                                                                                                                                                  • The shrink method now also handles sparse features.
                                                                                                                                                                                  • Refactor of the distribution method.
                                                                                                                                                                                  • Added three new linear models: Lars, Huber and Perc.
                                                                                                                                                                                  • Dimensions can be shared across models using the key 'all' in ht_params[\"dimensions\"].
                                                                                                                                                                                  • Assign hyperparameters to tune using the predefined dimensions.
                                                                                                                                                                                  • It's now possible to tune a custom number of layers for the MLP model.
                                                                                                                                                                                  • If multiple BO calls share the best score, the one with the shortest training time is selected as winner (instead of the first).
                                                                                                                                                                                  • Fixed a bug where the BO could fail when custom dimensions were defined.
                                                                                                                                                                                  • Fixed a bug where FeatureSelector could fail after repeated calls to fit.
                                                                                                                                                                                  • Fixed a bug where FeatureGenerator didn't pass the correct data indices to its output.
                                                                                                                                                                                  • Performance improvements for the custom pipeline.
                                                                                                                                                                                  • Minor documentation fixes.
                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-4100", "title": "Version 4.10.0", "text": "
                                                                                                                                                                                  • Added the holdout data set to have an extra way of assessing a model's performance on a completely independent dataset. Read more in the user_guide.
                                                                                                                                                                                  • Complete rework of the ensemble models.
                                                                                                                                                                                  • Support for dataframe indexing. Read more in the user guide.
                                                                                                                                                                                  • New plot_parshap plot to detect overfitting features.
                                                                                                                                                                                  • The new create_dashboard method makes analyzing the models even easier using a dashboard app.
                                                                                                                                                                                  • The plot_feature_importance plot now also accepts estimators with coefficients.
                                                                                                                                                                                  • Added the transform method for models.
                                                                                                                                                                                  • Added the threshold parameter to the evaluate method.
                                                                                                                                                                                  • The reset_predictions method is deprecated in favour of the new clear method.
                                                                                                                                                                                  • Refactor of the model's full_train method.
                                                                                                                                                                                  • The merge method is available for all trainers.
                                                                                                                                                                                  • Improvements in the trainer's pipeline.
                                                                                                                                                                                  • Training scores are now also saved to the mlflow run.
                                                                                                                                                                                  • Trying to change the data in a branch after fitting a model with it now raises an exception.
                                                                                                                                                                                  • Fixed a bug where the columns of array inputs were not ordered correctly.
                                                                                                                                                                                  • Fixed a bug where branches were not correctly treated as case-insensitive.
                                                                                                                                                                                  • Fixed a bug where the export_pipeline method for models would not export the transformers in the correct branch.
                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-491", "title": "Version 4.9.1", "text": "
                                                                                                                                                                                  • Changed the default cross-validation for hyperparameter tuning from 5 to 1 to avoid errors with deep learning models.
                                                                                                                                                                                  • Added clearer exception messages when a model's run failed.
                                                                                                                                                                                  • Fixed a bug where custom dimensions didn't show during hyperparameter tuning.
                                                                                                                                                                                  • Documentation improvements.
                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-490", "title": "Version 4.9.0", "text": "
                                                                                                                                                                                  • Dropped support for Python 3.6.
                                                                                                                                                                                  • Added the HistGBM model.
                                                                                                                                                                                  • Improved print layout for hyperparameter tuning.
                                                                                                                                                                                  • The new available_models method returns an overview of the available predefined models.
                                                                                                                                                                                  • The calibrate and cross_validate methods can no longer be accessed from the trainers.
                                                                                                                                                                                  • The pipeline parameter for the prediction methods is deprecated.
                                                                                                                                                                                  • Improved visualization of the plot_rfecv, plot_successive_halving and plot_learning_curve methods.
                                                                                                                                                                                  • Sparse matrices are now accepted as input.
                                                                                                                                                                                  • Duplicate BO calls are no longer calculated.
                                                                                                                                                                                  • Improvement in performance of the RNN model.
                                                                                                                                                                                  • Refactor of the model's bo attribute.
                                                                                                                                                                                  • Predefined hyperparameters have been updated to be consistent with sklearn's API.
                                                                                                                                                                                  • Fixed a bug where custom scalers were ignored by the models.
                                                                                                                                                                                  • Fixed a bug where the BO of certain models would crash with custom hyperparameters.
                                                                                                                                                                                  • Fixed a bug where duplicate column names could be generated from a custom transformer.
                                                                                                                                                                                  • Documentation improvements.
                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-480", "title": "Version 4.8.0", "text": "
                                                                                                                                                                                  • The Encoder class now directly handles unknown categories encountered during fitting.
                                                                                                                                                                                  • The Balancer and Encoder classes now accept custom estimators for the strategy parameter.
                                                                                                                                                                                  • The new merge method enables the user to merge multiple atom instances into one.
                                                                                                                                                                                  • The dtype shrinking is moved from atom's initializers to the shrink method.
                                                                                                                                                                                  • ATOM's custom pipeline now handles transformers fitted on a subset of the dataset.
                                                                                                                                                                                  • The column parameter in the distribution method is renamed to columns for continuity of the API.
                                                                                                                                                                                  • The mae criterion for the GBM model hyperparameter tuning is deprecated to be consistent with sklearn's API.
                                                                                                                                                                                  • Branches are now case-insensitive.
                                                                                                                                                                                  • Renaming a branch using an existing name now raises an exception.
                                                                                                                                                                                  • Fixed a bug where columns of type category broke the Imputer class.
                                                                                                                                                                                  • Fixed a bug where predictions of the Stacking ensemble crashed for branches with multiple transformers.
                                                                                                                                                                                  • The tables in the documentation now adapt to dark mode.
                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-473", "title": "Version 4.7.3", "text": "
                                                                                                                                                                                  • Fixed a bug where the conda-forge recipe couldn't install properly.
                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-472", "title": "Version 4.7.2", "text": "
                                                                                                                                                                                  • Fixed a bug where the pipeline failed for custom transformers that returned sparse matrices.
                                                                                                                                                                                  • Package requirements files are added to the installer.
                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-471", "title": "Version 4.7.1", "text": "
                                                                                                                                                                                  • Fixed a bug where the pip installer failed.
                                                                                                                                                                                  • Fixed a bug where categorical columns also selected datetime columns.
                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-470", "title": "Version 4.7.0", "text": "
                                                                                                                                                                                  • Launched our new slack channel!
                                                                                                                                                                                  • The new FeatureExtractor class extracts useful features from datetime columns.
                                                                                                                                                                                  • The new plot_det method plots a binary classifier's detection error tradeoff curve.
                                                                                                                                                                                  • The plot_partial_dependence method is now able to draw Individual Conditional Expectation (ICE) lines.
                                                                                                                                                                                  • The full traceback of exceptions encountered during training is now saved to the logger.
                                                                                                                                                                                  • ATOMClassifier and ATOMRegressor now convert the dtypes of the input data to the minimal allowed type for memory efficiency.
                                                                                                                                                                                  • The scoring method is renamed to evaluate to clarify its purpose.
                                                                                                                                                                                  • The column parameter in the apply method is renamed to columns for continuity of the API.
                                                                                                                                                                                  • Minor documentation improvements.
                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-460", "title": "Version 4.6.0", "text": "
                                                                                                                                                                                  • Added the full_train method to retrieve an estimator trained on the complete dataset.
                                                                                                                                                                                  • The score method is now also able to calculate custom metrics on new data.
                                                                                                                                                                                  • Refactor of the Imputer class.
                                                                                                                                                                                  • Refactor of the Encoder class to avoid errors for unknown classes and allow the input of missing values.
                                                                                                                                                                                  • The clean method no longer automatically encodes the target column for regression tasks.
                                                                                                                                                                                  • Creating a branch using a model's acronym as name now raises an exception.
                                                                                                                                                                                  • Fixed a bug where CatBoost failed when early_stopping < 1.
                                                                                                                                                                                  • Fixed a bug where created pipelines had duplicated names.
                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-450", "title": "Version 4.5.0", "text": "
                                                                                                                                                                                  • Support of NLP pipelines. Read more in the user guide.
                                                                                                                                                                                  • Integration of mlflow to track all models in the pipeline. Read more in the user guide.
                                                                                                                                                                                  • The new Normalizer class transforms features to a more Gaussian-like distribution.
                                                                                                                                                                                  • New cross_validate method to evaluate the robustness of a pipeline using cross_validation.
                                                                                                                                                                                  • New reset method to go back to atom's initial state.
                                                                                                                                                                                  • Added the Dummy model to compare other models with a simple baseline.
                                                                                                                                                                                  • New plot_wordcloud and plot_ngrams methods for text visualization.
                                                                                                                                                                                  • Plots now can return the figure object when display=None.
                                                                                                                                                                                  • The Pruner class is now able to drop outliers based on the selection of multiple strategies.
                                                                                                                                                                                  • The new shuffle parameter in atom's initializer determines whether to shuffle the dataset.
                                                                                                                                                                                  • The trainers no longer require you to specify a model using the models parameter. If left to default, all predefined models for that task are used.
                                                                                                                                                                                  • The apply method now accepts args and kwargs for the function.
                                                                                                                                                                                  • Refactor of the evaluate method.
                                                                                                                                                                                  • Refactor of the export_pipeline method.
                                                                                                                                                                                  • The parameters in the Cleaner class have been refactored to better describe their function.
                                                                                                                                                                                  • The train_sizes parameter in train_sizing now accepts integer values to automatically create equally distributed splits in the training set.
                                                                                                                                                                                  • Refactor of plot_pipeline to show models in the diagram as well.
                                                                                                                                                                                  • Refactor of the bagging parameter to the (more appropriate) name n_bootstrap.
                                                                                                                                                                                  • New option to exclude columns from a transformer adding ! before their name.
                                                                                                                                                                                  • Fixed a bug where the Pruner class failed if there were categorical columns in the dataset.
                                                                                                                                                                                  • Completely reworked documentation website.
                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-440", "title": "Version 4.4.0", "text": "
                                                                                                                                                                                  • New apply method to perform data transformations as a function in the pipeline.
                                                                                                                                                                                  • Added the status method to save an overview of atom's branches and models to the logger.
                                                                                                                                                                                  • Improved the output messages for the Imputer class.
                                                                                                                                                                                  • The dataset's columns can now be called directly from atom.
                                                                                                                                                                                  • The distribution and plot_distribution methods now ignore missing values.
                                                                                                                                                                                  • Fixed a bug where transformations could fail when columns were added to the dataset after initializing the pipeline.
                                                                                                                                                                                  • Fixed a bug where the Cleaner class didn't drop columns consisting entirely of missing values when drop_min_cardinality=True.
                                                                                                                                                                                  • Fixed a bug where the winning model wasn't displayed correctly.
                                                                                                                                                                                  • Refactored the way transformers are added or removed from predicting methods.
                                                                                                                                                                                  • Improved documentation.
                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-430", "title": "Version 4.3.0", "text": "
                                                                                                                                                                                  • Possibility to add custom transformers to the pipeline.
                                                                                                                                                                                  • The export_pipeline utility method exports atom's current pipeline to a sklearn object.
                                                                                                                                                                                  • New magic methods make atom behave similarly to sklearn's Pipeline.
                                                                                                                                                                                  • All training approaches can now be combined in the same atom instance.
                                                                                                                                                                                  • New plot_relationships, plot_distribution and plot_qq plots for data inspection.
                                                                                                                                                                                  • Complete rework of all the shap plots to be consistent with their new API.
                                                                                                                                                                                  • Improvements for the Scaler and Pruner classes.
                                                                                                                                                                                  • The acronym for custom models now defaults to the capital letters in the class' __name__.
                                                                                                                                                                                  • Possibility to apply transformations on only a subset of the columns.
                                                                                                                                                                                  • Plots and methods now accept winner as model name.
                                                                                                                                                                                  • Fixed a bug where custom metrics didn't show the correct name.
                                                                                                                                                                                  • Fixed a bug where timers were not displayed correctly.
                                                                                                                                                                                  • Further compatibility with deep learning datasets.
                                                                                                                                                                                  • Large refactoring for performance optimization.
                                                                                                                                                                                  • Cleaner output of messages to the logger.
                                                                                                                                                                                  • Plots no longer show a default title.
                                                                                                                                                                                  • Minor bug fixes.
                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-421", "title": "Version 4.2.1", "text": "
                                                                                                                                                                                  • Bug fix where there was memory leakage in successive halving and train sizing pipelines.
                                                                                                                                                                                  • Improved documentation.
                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-420", "title": "Version 4.2.0", "text": "
                                                                                                                                                                                  • Possibility to add custom models to the pipeline using ATOMModel.
                                                                                                                                                                                  • Compatibility with deep learning models.
                                                                                                                                                                                  • New branch system for different data pipelines. Read more in the user guide.
                                                                                                                                                                                  • Use the canvas contextmanager to draw multiple plots in one figure.
                                                                                                                                                                                  • New voting and stacking ensemble techniques.
                                                                                                                                                                                  • New get_class_weight utility method.
                                                                                                                                                                                  • New Sequential Feature Selection strategy for the FeatureSelector.
                                                                                                                                                                                  • Added the sample_weight parameter to the score method.
                                                                                                                                                                                  • New ways to initialize the data in the training instances.
                                                                                                                                                                                  • The test_size parameter now also allows integer values.
                                                                                                                                                                                  • Renamed categories to classes to be consistent with sklearn's API.
                                                                                                                                                                                  • The class property now returns a pd.DataFrame of the number of rows per target class in the train, test and complete dataset.
                                                                                                                                                                                  • Possibility to add custom parameters to an estimator's fit method through est_params.
                                                                                                                                                                                  • The successive halving and train sizing approaches now both allow subsequent runs from atom without losing the information from previous runs.
                                                                                                                                                                                  • Bug fix where ATOMLoader wouldn't encode the target column during transformation.
                                                                                                                                                                                  • Added the Deep learning, Ensembles and Utilities example notebooks.
                                                                                                                                                                                  • Support for python 3.9.
                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-410", "title": "Version 4.1.0", "text": "
                                                                                                                                                                                  • New est_params parameter to customize the parameters in every model's estimator.
                                                                                                                                                                                  • Following skopt's API, the n_random_starts parameter to specify the number of random trials is deprecated in favour of n_initial_points.
                                                                                                                                                                                  • The Balancer class now allows you to use any of the strategies from imblearn.
                                                                                                                                                                                  • New utility attributes to inspect the dataset.
                                                                                                                                                                                  • Four new models: CatNB, CNB, ARD and RNN.
                                                                                                                                                                                  • Added the models section to the documentation.
                                                                                                                                                                                  • Small changes in log outputs.
                                                                                                                                                                                  • Bug fixes and performance improvements.
                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-401", "title": "Version 4.0.1", "text": "
                                                                                                                                                                                  • Bug fix where the FeatureGenerator was not deterministic for a fixed random state.
                                                                                                                                                                                  • Bug fix where subsequent runs with the same metric failed.
                                                                                                                                                                                  • Added the license file to the package's installer.
                                                                                                                                                                                  • Typo fixes in documentation.
                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-400", "title": "Version 4.0.0", "text": "
                                                                                                                                                                                  • Bayesian optimization package changed from GpyOpt to skopt.
                                                                                                                                                                                  • Complete revision of the model's hyperparameters.
                                                                                                                                                                                  • Four SHAP plots can now be called directly from an ATOM pipeline.
                                                                                                                                                                                  • Two new plots for regression tasks.
                                                                                                                                                                                  • New plot_pipeline and pipeline attribute to access all transformers.
                                                                                                                                                                                  • Possibility to determine transformer parameters per method.
                                                                                                                                                                                  • New calibrate and plot_calibration methods.
                                                                                                                                                                                  • Metrics can now be added as scorers or functions with signature metric(y, y_pred, **kwargs).
                                                                                                                                                                                  • Implementation of multi-metric runs.
                                                                                                                                                                                  • Possibility to choose which metric to plot.
                                                                                                                                                                                  • Early stopping for models that allow in-training validation.
                                                                                                                                                                                  • Added the ATOMLoader function to load any saved pickle instance.
                                                                                                                                                                                  • The \"remove\" strategy in the data cleaning parameters is deprecated in favour of \"drop\".
                                                                                                                                                                                  • Implemented the dfs strategy in FeatureGenerator.
                                                                                                                                                                                  • All training classes now inherit from BaseEstimator.
                                                                                                                                                                                  • Added multiple new example notebooks.
                                                                                                                                                                                  • Test coverage up to 100%.
                                                                                                                                                                                  • Completely new documentation page.
                                                                                                                                                                                  • Bug fixes and performance improvements.
                                                                                                                                                                                  "}, {"location": "changelog/v5.x.x/", "title": "Release history", "text": ""}, {"location": "changelog/v5.x.x/#version-600", "title": "Version 6.0.0", "text": "

                                                                                                                                                                                  New features

                                                                                                                                                                                  • Completely new module for time series. Read more in the user guide.
                                                                                                                                                                                  • Support for Python 3.11 and drop support for Python 3.8 and Python 3.9.
                                                                                                                                                                                  • New data engines. Read more in the user guide.
                                                                                                                                                                                  • Improved memory optimizations. Read more in the user guide.
                                                                                                                                                                                  • Added the iterative strategy for numerical imputation.
                                                                                                                                                                                  • New update_traces method to further customize your plots.

                                                                                                                                                                                  API changes

                                                                                                                                                                                  • The FeatureGrouper class no longer accepts a name parameter. Provide the group names directly through the group parameter as dict.
                                                                                                                                                                                  • Rework of the register method.
                                                                                                                                                                                  • The multioutput attribute is deprecated. Multioutput meta-estimators are now assigned automatically.
                                                                                                                                                                                  • Model tags have to be separated from the acronym by an underscore.
                                                                                                                                                                                  • The engine parameter is now a dict.
                                                                                                                                                                                  • The automl method is deprecated.

                                                                                                                                                                                  Enhancements

                                                                                                                                                                                  • Transformations only on y are now accepted, e.g., atom.scale(columns=-1).
                                                                                                                                                                                  • Full support for pandas nullable dtypes.
                                                                                                                                                                                  • The dataset can now be provided as callable.
                                                                                                                                                                                  • The save and save_data methods now accept pathlib.Path objects as filename.
                                                                                                                                                                                  • Cleaner representation on hover for the plot_timeline method.
                                                                                                                                                                                  • Added the hdbscan strategy to the Pruner class.
                                                                                                                                                                                  • The cv key in ht_params now accepts a custom cross-validation generator.
                                                                                                                                                                                  • Improved error message for incorrect stratification of multioutput datasets.
                                                                                                                                                                                  • Rework of the shrink method.

                                                                                                                                                                                  Bug fixes

                                                                                                                                                                                  • Fixed a bug where the cross_validate method could fail for pipelines that changed the number of rows.
                                                                                                                                                                                  • Fixed a bug where the Pruner class didn't drop all outlier clusters.
                                                                                                                                                                                  • Fixed a bug where the pipeline could fail for transformers that returned a series.
                                                                                                                                                                                  • Fixed a bug where the pipeline could fail for transformers that reset their internal attributes during fitting.
                                                                                                                                                                                  • Fixed a bug where the register method failed in Databricks.
                                                                                                                                                                                  • Fixed a bug where tuning hyperparameters for a base_estimator inside a custom meta-estimator would fail.
                                                                                                                                                                                  • Fixed a bug where the data properties' @setter could fail for numpy arrays.

                                                                                                                                                                                  "}, {"location": "changelog/v5.x.x/#version-520", "title": "Version 5.2.0", "text": "

                                                                                                                                                                                  New features

                                                                                                                                                                                  • Two new plot methods: plot_terminator_improvement and plot_timeline.

                                                                                                                                                                                  Enhancements

                                                                                                                                                                                  • Data splits in every trial are now properly stratified according to the selected strategy.
                                                                                                                                                                                  • Performance optimization for multiple methods using smart caching.
                                                                                                                                                                                  • Improved visualizations for plots with logarithmic hyperparameters.

                                                                                                                                                                                  Bug fixes

                                                                                                                                                                                  • Fixed a bug where parameters in a trial would not match with those displayed.

                                                                                                                                                                                  "}, {"location": "changelog/v5.x.x/#version-512", "title": "Version 5.1.2", "text": "

                                                                                                                                                                                  API changes

                                                                                                                                                                                  • The default strategy for the encode method has changed from \"LeaveOneOut\" to \"Target\"-encoding. LeaveOneOut is no longer a supported strategy.

                                                                                                                                                                                  Bug fixes

                                                                                                                                                                                  • Fixed a bug where stratification failed for datasets where the target column was not placed last.
                                                                                                                                                                                  • Fixed a bug where transformers with no get_feature_names_out method could fail.
                                                                                                                                                                                  • Fixed a bug where the FeatureSelector class could fail when transforming a dataset with different column order than seen at fit time.

                                                                                                                                                                                  "}, {"location": "changelog/v5.x.x/#version-511", "title": "Version 5.1.1", "text": "

                                                                                                                                                                                  API changes

                                                                                                                                                                                  • The infrequent_to_value parameter in the Encoder class is replaced with infrequent_to_value to be consistent with sklearn's naming convention.

                                                                                                                                                                                  Enhancements

                                                                                                                                                                                  • Added the kwargs parameter to the save_data method.

                                                                                                                                                                                  Bug fixes

                                                                                                                                                                                  • Fixed an installation issue for systems without an x86 architecture.
                                                                                                                                                                                  • Fixed a bug where Voting would fail for certain metrics.
                                                                                                                                                                                  • Fixed a bug where the time metric in mlflow was always zero.
                                                                                                                                                                                  • Fixed a bug where shap plots wouldn't display the full column names.
                                                                                                                                                                                  • Fixed a bug where column names were not properly propagated during transformation.

                                                                                                                                                                                  "}, {"location": "changelog/v5.x.x/#version-510", "title": "Version 5.1.0", "text": "

                                                                                                                                                                                  New features

                                                                                                                                                                                  • Support for multilabel classification, multiclass-multilabel classification and multioutput regression tasks. Read more in the user guide.
                                                                                                                                                                                  • New backend parameter to choose a parallel execution backend.
                                                                                                                                                                                  • New parallel parameter to train multiple models simultaneously.
                                                                                                                                                                                  • Integration with DAGsHub to store your mlflow experiments. Read more in the user guide.
                                                                                                                                                                                  • New serve method to deploy models to a rest API endpoint.
                                                                                                                                                                                  • New get_best_threshold method to calculate the optimal threshold for binary and multilabel tasks.
                                                                                                                                                                                  • New get_sample_weight method to calculate the sample weights for a balanced data set.

                                                                                                                                                                                  API changes

                                                                                                                                                                                  • The ATOMLoader class is deprecated in favor of the load method.
                                                                                                                                                                                  • The errors attribute for runners is deprecated.

                                                                                                                                                                                  Enhancements

                                                                                                                                                                                  • Added three new notebook examples.
                                                                                                                                                                                  • Added the drop_chars parameter to the Cleaner class.
                                                                                                                                                                                  • Added the errors parameter to the trainers.
                                                                                                                                                                                  • Rework of the dependencies, making the base package more lightweight.
                                                                                                                                                                                  • The logging entries for external libraries are redirected to atom's file handler.

                                                                                                                                                                                  Bug fixes

                                                                                                                                                                                  • Fixed multiple errors that appeared after sklearn's 1.2 update.
                                                                                                                                                                                  • Fixed a bug where hyperparameter tuning could fail for multi-metric runs.
                                                                                                                                                                                  • Fixed a bug where trials would try to report the same step multiple times.
                                                                                                                                                                                  • Fixed a bug where custom models could skip in-training validation.
                                                                                                                                                                                  • Fixed an issue where the bootstrapping estimators were trained using partial_fit.

                                                                                                                                                                                  "}, {"location": "changelog/v5.x.x/#version-501", "title": "Version 5.0.1", "text": "

                                                                                                                                                                                  Bug fixes

                                                                                                                                                                                  • Fixed installation issue.
                                                                                                                                                                                  • Updated package dependencies.

                                                                                                                                                                                  "}, {"location": "changelog/v5.x.x/#version-500", "title": "Version 5.0.0", "text": "

                                                                                                                                                                                  New features

                                                                                                                                                                                  • Completely new hyperparameter tuning process.
                                                                                                                                                                                  • Completely reworked plotting interface.
                                                                                                                                                                                  • Accelerate your pipelines with sklearnex.
                                                                                                                                                                                  • New FeatureGrouper class to extract statistical features from similar groups.
                                                                                                                                                                                  • New create_app method to create a nice front-end for model predictions.
                                                                                                                                                                                  • New inverse_transform method for atom and models.
                                                                                                                                                                                  • New linear model: OrthogonalMatchingPursuit.
                                                                                                                                                                                  • The plot_results method now accepts time metrics.

                                                                                                                                                                                  API changes

                                                                                                                                                                                  • The gpu parameter is deprecated in favor of device and engine.
                                                                                                                                                                                  • Refactor of the Cleaner, Discretizer, Encoder and FeatureSelector classes.
                                                                                                                                                                                  • Refactor of all shap plots.
                                                                                                                                                                                  • Refactor of the apply method.
                                                                                                                                                                                  • The plot_scatter_matrix method is renamed to plot_relationships.
                                                                                                                                                                                  • The kSVM model is renamed to SVM.
                                                                                                                                                                                  • Multidimensional datasets are no longer supported. Check the deep learning section of the user guide for guidance with such datasets.
                                                                                                                                                                                  • The greater_is_better, needs_proba and needs_threshold parameters are deprecated. Metric functions are now created using make_scorer's default parameters.
                                                                                                                                                                                  • The drop method is removed from atom. Use the reworked apply method instead.
                                                                                                                                                                                  • The prediction methods can no longer be called from atom.
                                                                                                                                                                                  • The dashboard method for models is now called create_dashboard.

                                                                                                                                                                                  Enhancements

                                                                                                                                                                                  • New examples for plotting, automated feature scaling, pruning and advanced hyperparameter tuning.
                                                                                                                                                                                  • The Normalizer class can now be accelerated with GPU.
                                                                                                                                                                                  • The Scaler class now ignores binary columns (only 0s and 1s).
                                                                                                                                                                                  • The models parameter in plot and utility methods now accepts model indices.
                                                                                                                                                                                  • The transform method now also transforms only y when X has a default value.
                                                                                                                                                                                  • The prediction methods now return pandas objects.
                                                                                                                                                                                  • Dependency versions are checked with originals after unpickling.
                                                                                                                                                                                  • Automatic generation of documentation from docstrings.
                                                                                                                                                                                  • Improvements in documentation display for mobile phones.
                                                                                                                                                                                  • New feature_importance attribute for models.
                                                                                                                                                                                  • Added a visualization for automated feature scaling to plot_pipeline.

                                                                                                                                                                                  Bug fixes

                                                                                                                                                                                  • The FeatureExtractor class no longer raises a warning for highly fragmented dataframes.
                                                                                                                                                                                  • Fixed a bug where models could not call the score function.
                                                                                                                                                                                  • The Encoder class no longer fails when the user provides ordinal values that are not present during fitting.
                                                                                                                                                                                  • Fixed a bug with the max_nan_rows parameter in the Imputer class.
                                                                                                                                                                                  • Fixed a bug where Tokenizer could fail when no ngrams were found.
                                                                                                                                                                                  "}, {"location": "examples/accelerating_cuml/", "title": "Accelerating cuml", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                  from atom import ATOMClassifier\nfrom sklearn.datasets import make_classification\n\n# Create a dummy dataset\nX, y = make_classification(n_samples=100000, n_features=40)\n
                                                                                                                                                                                  from atom import ATOMClassifier from sklearn.datasets import make_classification # Create a dummy dataset X, y = make_classification(n_samples=100000, n_features=40) In\u00a0[2]: Copied!
                                                                                                                                                                                  atom = ATOMClassifier(X, y, device=\"gpu\", engine=\"cuml\", verbose=2)\n
                                                                                                                                                                                  atom = ATOMClassifier(X, y, device=\"gpu\", engine=\"cuml\", verbose=2)
                                                                                                                                                                                  << ================== ATOM ================== >>\nAlgorithm task: binary classification.\nGPU training enabled.\nBackend engine: cuml.\n\nDataset stats ==================== >>\nShape: (100000, 41)\nMemory: 32.80 MB\nScaled: True\nOutlier values: 8127 (0.2%)\n-------------------------------------\nTrain set size: 80000\nTest set size: 20000\n-------------------------------------\n|   |       dataset |         train |          test |\n| - | ------------- | ------------- | ------------- |\n| 0 |   50006 (1.0) |   40005 (1.0) |   10001 (1.0) |\n| 1 |   49994 (1.0) |   39995 (1.0) |    9999 (1.0) |\n\n
                                                                                                                                                                                  In\u00a0[3]: Copied!
                                                                                                                                                                                  atom.scale()\n
                                                                                                                                                                                  atom.scale()
                                                                                                                                                                                  Fitting Scaler...\nScaling features...\n
                                                                                                                                                                                  In\u00a0[13]: Copied!
                                                                                                                                                                                  atom.dataset\n
                                                                                                                                                                                  atom.dataset Out[13]: x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 ... x31 x32 x33 x34 x35 x36 x37 x38 x39 target 0 2.021646 -0.634557 -0.867811 1.103642 1.559011 0.122284 -0.864821 1.411657 0.147997 -2.269082 ... -0.489864 1.861048 -0.353861 0.720823 -1.522117 -0.737707 -1.573936 -0.832174 0.203154 0 1 -0.019885 0.846568 -0.364059 -1.091604 -1.336692 0.186689 -0.274142 0.020563 0.693235 -1.908658 ... -1.610058 -0.365231 0.284908 0.170156 -0.236553 -0.573761 -0.107317 -2.480178 0.420341 0 2 0.516618 -0.013420 -0.753879 -0.488243 0.560051 0.395817 -0.522523 -1.083503 -0.073398 0.383061 ... 0.966283 1.405546 -0.658654 0.339090 -1.615997 -1.312444 0.984578 0.602858 -1.110684 1 3 0.111861 -0.966334 0.208509 0.494328 -0.766835 -0.003399 -0.500449 -0.530622 -0.481663 -1.146132 ... -0.304896 2.030211 -1.189488 -1.238600 1.658765 -0.255644 0.572194 0.195496 0.617734 1 4 0.160135 -0.873517 0.719142 -2.020767 0.421435 -1.941230 0.835615 -1.178845 0.235273 -0.328574 ... 1.633662 -0.631118 1.814046 1.031754 0.328665 1.704483 2.153710 -1.430552 -0.543915 1 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 99995 1.100240 0.092581 -0.346265 0.234024 0.590199 0.755019 -1.688456 -1.031070 -0.620193 -0.283336 ... 0.356480 1.346821 -0.299087 2.343587 -2.003646 -0.933179 0.764255 -0.233526 -1.462311 1 99996 -1.142596 0.321843 -0.974006 0.390418 0.404722 -0.324256 -0.288176 1.009458 0.860912 -0.191313 ... 0.044618 -2.030135 1.448640 -0.854798 1.441451 1.347461 -0.937607 0.572504 -0.787673 0 99997 1.658252 0.303637 -0.020324 0.225917 0.154092 -1.208507 -0.199919 1.063016 -0.395696 -0.060886 ... 1.563345 -1.261853 -0.810122 -0.503823 1.565602 -1.264792 -0.591644 1.588397 0.601721 0 99998 -0.288042 -1.139792 1.548338 0.501413 0.361604 -0.315720 -0.564607 1.500870 0.501768 0.649079 ... 
0.344663 1.734476 0.660177 0.767554 1.461940 0.310189 -1.469978 0.900132 1.114330 0 99999 -3.093351 -0.636463 -0.449575 1.169980 -1.041870 -0.257173 2.072777 -0.101111 -0.956916 -0.251162 ... 2.250647 0.746250 -0.610311 0.445467 -0.636288 -0.187444 0.226108 -0.186927 -1.024960 1

                                                                                                                                                                                  100000 rows \u00d7 41 columns

                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                  print(f\"Scaler used: {atom.standard}\")\nprint(f\"Scaler's module: {atom.standard.__class__.__module__}\")\n
                                                                                                                                                                                  print(f\"Scaler used: {atom.standard}\") print(f\"Scaler's module: {atom.standard.__class__.__module__}\")
                                                                                                                                                                                  Scaler used: StandardScaler()\nScaler's module: cuml._thirdparty.sklearn.preprocessing._data\n
                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                  atom.run(models=[\"RF\", \"SGD\", \"XGB\"])\n
                                                                                                                                                                                  atom.run(models=[\"RF\", \"SGD\", \"XGB\"])
                                                                                                                                                                                  \nTraining ========================= >>\nModels: RF, SGD, XGB\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9726\nTest evaluation --> f1: 0.9431\nTime elapsed: 1.935s\n-------------------------------------------------\nTotal time: 1.935s\n\n\nResults for StochasticGradientDescent:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9236\nTest evaluation --> f1: 0.9219\nTime elapsed: 02m:16s\n-------------------------------------------------\nTotal time: 02m:16s\n\n\nResults for XGBoost:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9749\nTest evaluation --> f1: 0.9437\nTime elapsed: 6.394s\n-------------------------------------------------\nTotal time: 6.394s\n\n\nFinal results ==================== >>\nTotal time: 02m:24s\n-------------------------------------\nRandomForest              --> f1: 0.9431\nStochasticGradientDescent --> f1: 0.9219\nXGBoost                   --> f1: 0.9437 !\n
                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                  atom.results\n
                                                                                                                                                                                  atom.results Out[6]: score_train score_test time_fit time RF 0.9726 0.9431 1.934512 1.934512 SGD 0.9236 0.9219 135.871493 135.871493 XGB 0.9749 0.9437 6.394416 6.394416 In\u00a0[7]: Copied!
                                                                                                                                                                                  for m in atom.models:\n    print(f\"{m}'s module: {atom[m].estimator.__class__.__module__}\")\n
                                                                                                                                                                                  for m in atom.models: print(f\"{m}'s module: {atom[m].estimator.__class__.__module__}\")
                                                                                                                                                                                  RF's module: cuml.ensemble.randomforestclassifier\nSGD's module: sklearn.linear_model._stochastic_gradient\nXGB's module: xgboost.sklearn\n
                                                                                                                                                                                  In\u00a0[8]: Copied!
                                                                                                                                                                                  atom.evaluate()\n
                                                                                                                                                                                  atom.evaluate() Out[8]: accuracy average_precision balanced_accuracy f1 jaccard matthews_corrcoef precision recall roc_auc RF 0.9429 0.9741 0.9429 0.9431 0.8924 0.8858 0.9391 0.9472 0.9792 SGD 0.9217 0.9635 0.9218 0.9219 0.8551 0.8435 0.9203 0.9235 0.9676 XGB 0.9434 0.9753 0.9434 0.9437 0.8933 0.8868 0.9385 0.9489 0.9798"}, {"location": "examples/accelerating_cuml/#example-accelerating-pipelines-on-gpu", "title": "Example: Accelerating pipelines on GPU\u00b6", "text": "

                                                                                                                                                                                  This example shows how to accelerate a pipeline on GPU using cuML.

                                                                                                                                                                                  The data used is a synthetic dataset created using sklearn's make_classification function.

                                                                                                                                                                                  "}, {"location": "examples/accelerating_sklearnex/", "title": "Accelerating sklearnex", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                  # Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                  # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                  # Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n
                                                                                                                                                                                  # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\") # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0

                                                                                                                                                                                  5 rows \u00d7 22 columns

                                                                                                                                                                                  In\u00a0[3]: Copied!
                                                                                                                                                                                  atom = ATOMClassifier(X, \"RainTomorrow\", verbose=2)\n
                                                                                                                                                                                  atom = ATOMClassifier(X, \"RainTomorrow\", verbose=2)
                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 25.03 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n\n
                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                  # Impute missing values and encode categorical columns\natom.impute()\natom.encode()\n
                                                                                                                                                                                  # Impute missing values and encode categorical columns atom.impute() atom.encode()
                                                                                                                                                                                  Fitting Imputer...\nImputing missing values...\n --> Dropping 637 samples due to missing values in feature MinTemp.\n --> Dropping 322 samples due to missing values in feature MaxTemp.\n --> Dropping 1406 samples due to missing values in feature Rainfall.\n --> Dropping 60843 samples due to missing values in feature Evaporation.\n --> Dropping 67816 samples due to missing values in feature Sunshine.\n --> Dropping 9330 samples due to missing values in feature WindGustDir.\n --> Dropping 9270 samples due to missing values in feature WindGustSpeed.\n --> Dropping 10013 samples due to missing values in feature WindDir9am.\n --> Dropping 3778 samples due to missing values in feature WindDir3pm.\n --> Dropping 1348 samples due to missing values in feature WindSpeed9am.\n --> Dropping 2630 samples due to missing values in feature WindSpeed3pm.\n --> Dropping 1774 samples due to missing values in feature Humidity9am.\n --> Dropping 3610 samples due to missing values in feature Humidity3pm.\n --> Dropping 14014 samples due to missing values in feature Pressure9am.\n --> Dropping 13981 samples due to missing values in feature Pressure3pm.\n --> Dropping 53657 samples due to missing values in feature Cloud9am.\n --> Dropping 57094 samples due to missing values in feature Cloud3pm.\n --> Dropping 904 samples due to missing values in feature Temp9am.\n --> Dropping 2726 samples due to missing values in feature Temp3pm.\n --> Dropping 1406 samples due to missing values in feature RainToday.\nFitting Encoder...\nEncoding categorical columns...\n --> Target-encoding feature Location. Contains 26 classes.\n --> Target-encoding feature WindGustDir. Contains 16 classes.\n --> Target-encoding feature WindDir9am. Contains 16 classes.\n --> Target-encoding feature WindDir3pm. 
Contains 16 classes.\n --> Ordinal-encoding feature RainToday. Contains 2 classes.\n
                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                  # Train a K-Nearest Neighbors model (using default sklearn)\natom.run(models=\"KNN\", metric=\"f1\")\n
                                                                                                                                                                                  # Train a K-Nearest Neighbors model (using default sklearn) atom.run(models=\"KNN\", metric=\"f1\")
                                                                                                                                                                                  \nTraining ========================= >>\nModels: KNN\nMetric: f1\n\n\nResults for KNearestNeighbors:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.7135\nTest evaluation --> f1: 0.5904\nTime elapsed: 4.239s\n-------------------------------------------------\nTime: 4.239s\n\n\nFinal results ==================== >>\nTotal time: 8.264s\n-------------------------------------\nKNearestNeighbors --> f1: 0.5904\n
                                                                                                                                                                                  In\u00a0[7]: Copied!
                                                                                                                                                                                  # Now, we train an accelerated KNN using engine=\"sklearnex\"\n# Note the difference in training speed!!\natom.run(models=\"KNN_acc\", metric=\"f1\", engine={\"estimator\": \"sklearnex\"})\n
                                                                                                                                                                                  # Now, we train an accelerated KNN using engine=\"sklearnex\" # Note the difference in training speed!! atom.run(models=\"KNN_acc\", metric=\"f1\", engine={\"estimator\": \"sklearnex\"})
                                                                                                                                                                                  \nTraining ========================= >>\nModels: KNN_acc\nMetric: f1\n\n\nResults for KNearestNeighbors:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.7135\nTest evaluation --> f1: 0.5904\nTime elapsed: 1.185s\n-------------------------------------------------\nTime: 1.185s\n\n\nFinal results ==================== >>\nTotal time: 2.226s\n-------------------------------------\nKNearestNeighbors --> f1: 0.5904\n
                                                                                                                                                                                  In\u00a0[8]: Copied!
                                                                                                                                                                                  atom.results\n
                                                                                                                                                                                  atom.results Out[8]: f1_train f1_test time_fit time KNN 0.7135 0.5904 4.238729 4.238729 KNN_acc 0.7135 0.5904 1.184578 1.184578 In\u00a0[9]: Copied!
                                                                                                                                                                                  # Note how the underlying estimators might look the same...\nprint(atom.knn.estimator)\nprint(atom.knn_acc.estimator)\n\n# ... but are using different implementations\nprint(atom.knn.estimator.__module__)\nprint(atom.knn_acc.estimator.__module__)\n
                                                                                                                                                                                  # Note how the underlying estimators might look the same... print(atom.knn.estimator) print(atom.knn_acc.estimator) # ... but are using different implementations print(atom.knn.estimator.__module__) print(atom.knn_acc.estimator.__module__)
                                                                                                                                                                                  KNeighborsClassifier(n_jobs=1)\nKNeighborsClassifier(n_jobs=1)\nsklearn.neighbors._classification\nsklearnex.neighbors.knn_classification\n
                                                                                                                                                                                  In\u00a0[10]: Copied!
                                                                                                                                                                                  with atom.canvas(1, 2, title=\"Timing engines: sklearn vs sklearnex\"):\n    atom.plot_results(metric=\"time_fit\", title=\"Training\")\n    atom.plot_results(metric=\"time\", title=\"Total\")\n
                                                                                                                                                                                  with atom.canvas(1, 2, title=\"Timing engines: sklearn vs sklearnex\"): atom.plot_results(metric=\"time_fit\", title=\"Training\") atom.plot_results(metric=\"time\", title=\"Total\")"}, {"location": "examples/accelerating_sklearnex/#example-accelerating-pipelines", "title": "Example: Accelerating pipelines\u00b6", "text": "

                                                                                                                                                                                  This example shows how to accelerate your models on cpu using sklearnex.

                                                                                                                                                                                  The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target RainTomorrow.

                                                                                                                                                                                  "}, {"location": "examples/accelerating_sklearnex/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/accelerating_sklearnex/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/accelerating_sklearnex/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/advanced_plotting/", "title": "Advanced plotting", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                  # Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                  # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                  # Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n
                                                                                                                                                                                  # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\") # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0

                                                                                                                                                                                  5 rows \u00d7 22 columns

                                                                                                                                                                                  In\u00a0[3]: Copied!
                                                                                                                                                                                  atom = ATOMClassifier(X, y=\"RainTomorrow\", verbose=1)\natom.impute()\natom.encode()\n
                                                                                                                                                                                  atom = ATOMClassifier(X, y=\"RainTomorrow\", verbose=1) atom.impute() atom.encode()
                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 25.03 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n\nFitting Imputer...\nImputing missing values...\nFitting Encoder...\nEncoding categorical columns...\n
                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                  # Let's see what the default aesthetics look like\natom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\")\n
                                                                                                                                                                                  # Let's see what the default aesthetics look like atom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\") In\u00a0[5]: Copied!
                                                                                                                                                                                  # Change the color palette using color names or their hex codes\natom.palette = [\"red\", \"#00f\"]\n
                                                                                                                                                                                  # Change the color palette using color names or their hex codes atom.palette = [\"red\", \"#00f\"] In\u00a0[6]: Copied!
                                                                                                                                                                                  atom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\")\n
                                                                                                                                                                                  atom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\") In\u00a0[7]: Copied!
                                                                                                                                                                                  # Change the title and label fontsize\natom.title_fontsize = 30\natom.label_fontsize = 24\natom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\")\n
                                                                                                                                                                                  # Change the title and label fontsize atom.title_fontsize = 30 atom.label_fontsize = 24 atom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\") In\u00a0[8]: Copied!
                                                                                                                                                                                  # Use the update_layout method to change layout properties\natom.update_layout(template=\"simple_white\", barmode=\"group\", hovermode=\"x\")\natom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\")\n
                                                                                                                                                                                  # Use the update_layout method to change layout properties atom.update_layout(template=\"simple_white\", barmode=\"group\", hovermode=\"x\") atom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\") In\u00a0[9]: Copied!
                                                                                                                                                                                  # Use the update_traces method to change the trace (note the y-axis)\natom.update_traces(histnorm=\"percent\", selector=dict(type=\"histogram\"))\natom.plot_distribution(columns=[1, 2], distributions=None, title=\"Distribution of temperatures\")\n
                                                                                                                                                                                  # Use the update_traces method to change the trace (note the y-axis) atom.update_traces(histnorm=\"percent\", selector=dict(type=\"histogram\")) atom.plot_distribution(columns=[1, 2], distributions=None, title=\"Distribution of temperatures\") In\u00a0[10]: Copied!
                                                                                                                                                                                  # Let's go back to the default aesthetics\natom.reset_aesthetics()\natom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\")\n
                                                                                                                                                                                  # Let's go back to the default aesthetics atom.reset_aesthetics() atom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\") In\u00a0[11]: Copied!
                                                                                                                                                                                  # And update the title with some custom fonts\natom.plot_distribution(\n    columns=[1, 2],\n    title=dict(\n        text=\"Distribution of temperatures\",\n        font_color=\"teal\",\n        x=0,\n        xanchor=\"left\",\n    )\n)\n
                                                                                                                                                                                  # And update the title with some custom fonts atom.plot_distribution( columns=[1, 2], title=dict( text=\"Distribution of temperatures\", font_color=\"teal\", x=0, xanchor=\"left\", ) ) In\u00a0[12]: Copied!
                                                                                                                                                                                  # We can update the legend in a similar fashion\natom.plot_distribution(\n    columns=[1, 2],\n    title=dict(\n        text=\"Distribution of temperatures\",\n        font_color=\"teal\",\n        x=0,\n        xanchor=\"left\",\n    ),\n    legend=dict(title=\"Legend's title\"),\n)\n
                                                                                                                                                                                  # We can update the legend in a similar fashion atom.plot_distribution( columns=[1, 2], title=dict( text=\"Distribution of temperatures\", font_color=\"teal\", x=0, xanchor=\"left\", ), legend=dict(title=\"Legend's title\"), ) In\u00a0[13]: Copied!
                                                                                                                                                                                  atom.run(\"LR\")\n\n# You can plot the ROC curve for a selection of rows,\n# for example, for rows in a specific location\natom.plot_roc(\n    rows={\n        \"Portland\": atom.test.loc[atom.og.X.Location == \"Portland\"],\n        \"Sydney\": atom.test.loc[atom.og.X.Location == \"Sydney\"],\n    }\n)\n
                                                                                                                                                                                  atom.run(\"LR\") # You can plot the ROC curve for a selection of rows, # for example, for rows in a specific location atom.plot_roc( rows={ \"Portland\": atom.test.loc[atom.og.X.Location == \"Portland\"], \"Sydney\": atom.test.loc[atom.og.X.Location == \"Sydney\"], } )
                                                                                                                                                                                  \nTraining ========================= >>\nModels: LR\nMetric: f1\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.6247\nTest evaluation --> f1: 0.6093\nTime elapsed: 0.636s\n-------------------------------------------------\nTime: 0.636s\n\n\nFinal results ==================== >>\nTotal time: 1.044s\n-------------------------------------\nLogisticRegression --> f1: 0.6093\n
                                                                                                                                                                                  In\u00a0[14]: Copied!
                                                                                                                                                                                  # Note how the same column over different plots is grouped\nwith atom.canvas(2, 2):\n    atom.plot_distribution(columns=1)\n    atom.plot_distribution(columns=2)\n    atom.plot_qq(columns=[1, 2], distributions=[\"norm\", \"invgauss\"])\n    atom.plot_qq(columns=[1, 2])\n
                                                                                                                                                                                  # Note how the same column over different plots is grouped with atom.canvas(2, 2): atom.plot_distribution(columns=1) atom.plot_distribution(columns=2) atom.plot_qq(columns=[1, 2], distributions=[\"norm\", \"invgauss\"]) atom.plot_qq(columns=[1, 2])"}, {"location": "examples/advanced_plotting/#example-advanced-plotting", "title": "Example: Advanced plotting\u00b6", "text": "

                                                                                                                                                                                  This example shows how to make the best use of all of atom's plotting options.

                                                                                                                                                                                  The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target RainTomorrow.

                                                                                                                                                                                  "}, {"location": "examples/advanced_plotting/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#customize-colors-and-font-size", "title": "Customize colors and font size\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#customize-the-plots-layout", "title": "Customize the plot's layout\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#customize-the-plots-traces", "title": "Customize the plot's traces\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#customize-the-title-and-legend", "title": "Customize the title and legend\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#customizing-the-rows-to-plot", "title": "Customizing the rows to plot\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#using-a-canvas", "title": "Using a canvas\u00b6", "text": ""}, {"location": "examples/automated_feature_scaling/", "title": "Automated feature scaling", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                  # Import packages\nfrom sklearn.datasets import load_breast_cancer\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                  # Import packages from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                  # Load the data\nX, y = load_breast_cancer(return_X_y=True)\n
                                                                                                                                                                                  # Load the data X, y = load_breast_cancer(return_X_y=True) In\u00a0[3]: Copied!
                                                                                                                                                                                  atom = ATOMClassifier(X, y, verbose=2, random_state=1)\n
                                                                                                                                                                                  atom = ATOMClassifier(X, y, verbose=2, random_state=1)
                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 141.24 kB\nScaled: False\nOutlier values: 167 (1.2%)\n\n
                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                  # Check which models require feature scaling\natom.available_models()[[\"acronym\", \"model\", \"needs_scaling\"]]\n
                                                                                                                                                                                  # Check which models require feature scaling atom.available_models()[[\"acronym\", \"model\", \"needs_scaling\"]] Out[4]: acronym model needs_scaling 0 AdaB AdaBoost False 1 Bag Bagging False 2 BNB BernoulliNB False 3 CatB CatBoost True 4 CatNB CategoricalNB False 5 CNB ComplementNB False 6 Tree DecisionTree False 7 Dummy Dummy False 8 ETree ExtraTree False 9 ET ExtraTrees False 10 GNB GaussianNB False 11 GP GaussianProcess False 12 GBM GradientBoostingMachine False 13 hGBM HistGradientBoosting False 14 KNN KNearestNeighbors True 15 LGB LightGBM True 16 LDA LinearDiscriminantAnalysis False 17 lSVM LinearSVM True 18 LR LogisticRegression True 19 MLP MultiLayerPerceptron True 20 MNB MultinomialNB False 21 PA PassiveAggressive True 22 Perc Perceptron True 23 QDA QuadraticDiscriminantAnalysis False 24 RNN RadiusNearestNeighbors True 25 RF RandomForest False 26 Ridge Ridge True 27 SGD StochasticGradientDescent True 28 SVM SupportVectorMachine True 29 XGB XGBoost True In\u00a0[5]: Copied!
                                                                                                                                                                                  # We fit two models: LR needs scaling and Bag doesn't\natom.run([\"LR\", \"Bag\"])\n
                                                                                                                                                                                  # We fit two models: LR needs scaling and Bag doesn't atom.run([\"LR\", \"Bag\"])
                                                                                                                                                                                  \nTraining ========================= >>\nModels: LR, Bag\nMetric: f1\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9913\nTest evaluation --> f1: 0.9861\nTime elapsed: 0.051s\n-------------------------------------------------\nTime: 0.051s\n\n\nResults for Bagging:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9982\nTest evaluation --> f1: 0.9444\nTime elapsed: 0.111s\n-------------------------------------------------\nTime: 0.111s\n\n\nFinal results ==================== >>\nTotal time: 0.216s\n-------------------------------------\nLogisticRegression --> f1: 0.9861 !\nBagging            --> f1: 0.9444\n
                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                  # Now, we create a new branch and scale the features before fitting the model\natom.branch = \"scaling\"\n
                                                                                                                                                                                  # Now, we create a new branch and scale the features before fitting the model atom.branch = \"scaling\"
                                                                                                                                                                                  Successfully created new branch: scaling.\n
                                                                                                                                                                                  In\u00a0[7]: Copied!
                                                                                                                                                                                  atom.scale()\n
                                                                                                                                                                                  atom.scale()
                                                                                                                                                                                  Fitting Scaler...\nScaling features...\n
                                                                                                                                                                                  In\u00a0[8]: Copied!
                                                                                                                                                                                  atom.run(\"LR_2\")\n
                                                                                                                                                                                  atom.run(\"LR_2\")
                                                                                                                                                                                  \nTraining ========================= >>\nModels: LR_2\nMetric: f1\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9913\nTest evaluation --> f1: 0.9861\nTime elapsed: 0.035s\n-------------------------------------------------\nTime: 0.035s\n\n\nFinal results ==================== >>\nTotal time: 0.057s\n-------------------------------------\nLogisticRegression --> f1: 0.9861\n
                                                                                                                                                                                  In\u00a0[9]: Copied!
                                                                                                                                                                                  # Let's compare the differences between the models\nprint(atom.lr.scaler)\nprint(atom.bag.scaler)\nprint(atom.lr_2.scaler)\n
                                                                                                                                                                                  # Let's compare the differences between the models print(atom.lr.scaler) print(atom.bag.scaler) print(atom.lr_2.scaler)
                                                                                                                                                                                  Scaler()\nNone\nNone\n
                                                                                                                                                                                  In\u00a0[10]: Copied!
                                                                                                                                                                                  # And the data they use is different\nprint(atom.lr.X.iloc[:5, :3])\nprint(\"-----------------------------\")\nprint(atom.bag.X.iloc[:5, :3])\nprint(\"-----------------------------\")\nprint(atom.lr_2.X_train.equals(atom.lr.X_train))\n
                                                                                                                                                                                  # And the data they use is different print(atom.lr.X.iloc[:5, :3]) print(\"-----------------------------\") print(atom.bag.X.iloc[:5, :3]) print(\"-----------------------------\") print(atom.lr_2.X_train.equals(atom.lr.X_train))
                                                                                                                                                                                           x0        x1        x2\n0 -0.181875  0.356669 -0.147122\n1  1.162216  0.300578  1.159704\n2  1.056470  1.212060  0.933833\n3  0.277287  2.457753  0.188054\n4 -1.442482 -0.825921 -1.343434\n-----------------------------\n      x0     x1      x2\n0  13.48  20.82   88.40\n1  18.31  20.58  120.80\n2  17.93  24.48  115.20\n3  15.13  29.81   96.71\n4   8.95  15.76   58.74\n-----------------------------\nTrue\n
                                                                                                                                                                                  In\u00a0[11]: Copied!
                                                                                                                                                                                  # Note that the scaler is included in the model's pipeline\nprint(atom.lr.pipeline)\nprint(\"-----------------------------\")\nprint(atom.bag.pipeline)\nprint(\"-----------------------------\")\nprint(atom.lr_2.pipeline)\n
                                                                                                                                                                                  # Note that the scaler is included in the model's pipeline print(atom.lr.pipeline) print(\"-----------------------------\") print(atom.bag.pipeline) print(\"-----------------------------\") print(atom.lr_2.pipeline)
                                                                                                                                                                                  Pipeline(memory=Memory(location=None), steps=[('AutomatedScaler', Scaler())])\n-----------------------------\nPipeline(memory=Memory(location=None), steps=[])\n-----------------------------\nPipeline(memory=Memory(location=None), steps=[('Scaler', Scaler(verbose=2))])\n
                                                                                                                                                                                  In\u00a0[12]: Copied!
                                                                                                                                                                                  atom.plot_pipeline()\n
                                                                                                                                                                                  atom.plot_pipeline()"}, {"location": "examples/automated_feature_scaling/#example-automated-feature-scaling", "title": "Example: Automated feature scaling\u00b6", "text": "

                                                                                                                                                                                  This example shows how ATOM handles models that require automated feature scaling.

                                                                                                                                                                                  Import the breast cancer dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.

                                                                                                                                                                                  "}, {"location": "examples/automated_feature_scaling/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/automated_feature_scaling/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/automated_feature_scaling/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/binary_classification/", "title": "Binary classification", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                  # Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                  # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                  # Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n
                                                                                                                                                                                  # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\") # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0

                                                                                                                                                                                  5 rows \u00d7 22 columns

                                                                                                                                                                                  In\u00a0[3]: Copied!
                                                                                                                                                                                  # Call atom using only 5% of the complete dataset (for explanatory purposes)\natom = ATOMClassifier(X, y=\"RainTomorrow\", n_rows=0.05, n_jobs=8, verbose=2)\n
                                                                                                                                                                                  # Call atom using only 5% of the complete dataset (for explanatory purposes) atom = ATOMClassifier(X, y=\"RainTomorrow\", n_rows=0.05, n_jobs=8, verbose=2)
                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\nParallel processing with 8 cores.\nParallelization backend: loky\n\nDataset stats ==================== >>\nShape: (7109, 22)\nTrain set size: 5688\nTest set size: 1421\n-------------------------------------\nMemory: 1.25 MB\nScaled: False\nMissing values: 15868 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 1 (0.0%)\n\n
                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                  # Impute missing values\natom.impute(strat_num=\"median\", strat_cat=\"drop\", max_nan_rows=0.8)\n
                                                                                                                                                                                  # Impute missing values atom.impute(strat_num=\"median\", strat_cat=\"drop\", max_nan_rows=0.8)
                                                                                                                                                                                  Fitting Imputer...\nImputing missing values...\n --> Dropping 7 samples for containing more than 16 missing values.\n --> Imputing 23 missing values with median (11.9) in feature MinTemp.\n --> Imputing 10 missing values with median (22.6) in feature MaxTemp.\n --> Imputing 72 missing values with median (0.0) in feature Rainfall.\n --> Imputing 3059 missing values with median (4.6) in feature Evaporation.\n --> Imputing 3382 missing values with median (8.5) in feature Sunshine.\n --> Dropping 467 samples due to missing values in feature WindGustDir.\n --> Imputing 466 missing values with median (39.0) in feature WindGustSpeed.\n --> Dropping 479 samples due to missing values in feature WindDir9am.\n --> Dropping 165 samples due to missing values in feature WindDir3pm.\n --> Imputing 53 missing values with median (13.0) in feature WindSpeed9am.\n --> Imputing 115 missing values with median (17.0) in feature WindSpeed3pm.\n --> Imputing 72 missing values with median (70.0) in feature Humidity9am.\n --> Imputing 164 missing values with median (52.0) in feature Humidity3pm.\n --> Imputing 699 missing values with median (1017.7) in feature Pressure9am.\n --> Imputing 699 missing values with median (1015.4) in feature Pressure3pm.\n --> Imputing 2698 missing values with median (5.0) in feature Cloud9am.\n --> Imputing 2903 missing values with median (5.0) in feature Cloud3pm.\n --> Imputing 32 missing values with median (16.7) in feature Temp9am.\n --> Imputing 116 missing values with median (21.1) in feature Temp3pm.\n --> Dropping 72 samples due to missing values in feature RainToday.\n
                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                  # Encode the categorical features\natom.encode(strategy=\"Target\", max_onehot=10, infrequent_to_value=0.04)\n
                                                                                                                                                                                  # Encode the categorical features atom.encode(strategy=\"Target\", max_onehot=10, infrequent_to_value=0.04)
                                                                                                                                                                                  Fitting Encoder...\nEncoding categorical columns...\n --> Target-encoding feature Location. Contains 47 classes.\n --> Target-encoding feature WindGustDir. Contains 16 classes.\n --> Target-encoding feature WindDir9am. Contains 16 classes.\n --> Target-encoding feature WindDir3pm. Contains 16 classes.\n --> Ordinal-encoding feature RainToday. Contains 2 classes.\n
                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                  # Train an Extra-Trees and a Random Forest model\natom.run(models=[\"ET\", \"RF\"], metric=\"f1\", n_bootstrap=5)\n
                                                                                                                                                                                  # Train an Extra-Trees and a Random Forest model atom.run(models=[\"ET\", \"RF\"], metric=\"f1\", n_bootstrap=5)
                                                                                                                                                                                  \nTraining ========================= >>\nModels: ET, RF\nMetric: f1\n\n\nResults for ExtraTrees:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.5956\nTime elapsed: 1.414s\nBootstrap ---------------------------------------\nEvaluation --> f1: 0.5709 \u00b1 0.0198\nTime elapsed: 1.020s\n-------------------------------------------------\nTime: 2.434s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.6124\nTime elapsed: 0.337s\nBootstrap ---------------------------------------\nEvaluation --> f1: 0.5802 \u00b1 0.0111\nTime elapsed: 1.281s\n-------------------------------------------------\nTime: 1.618s\n\n\nFinal results ==================== >>\nTotal time: 4.225s\n-------------------------------------\nExtraTrees   --> f1: 0.5709 \u00b1 0.0198 ~\nRandomForest --> f1: 0.5802 \u00b1 0.0111 ~ !\n
                                                                                                                                                                                  In\u00a0[7]: Copied!
                                                                                                                                                                                  # Let's have a look at the final results\natom.results\n
                                                                                                                                                                                  # Let's have a look at the final results atom.results Out[7]: f1_train f1_test time_fit f1_bootstrap time_bootstrap time ET 0.8503 0.5688 1.414043 0.570892 1.019728 2.433771 RF 0.8552 0.5612 0.336765 0.580178 1.281000 1.617765 In\u00a0[8]: Copied!
                                                                                                                                                                                  # Visualize the bootstrap results\natom.plot_results(title=\"RF vs ET performance\")\n
                                                                                                                                                                                  # Visualize the bootstrap results atom.plot_results(title=\"RF vs ET performance\") In\u00a0[9]: Copied!
                                                                                                                                                                                  # Print the results of some common metrics\natom.evaluate()\n
                                                                                                                                                                                  # Print the results of some common metrics atom.evaluate() Out[9]: accuracy ap ba f1 jaccard mcc precision recall auc ET 0.8478 0.6904 0.7059 0.5688 0.3974 0.5108 0.7750 0.4493 0.8561 RF 0.8405 0.6775 0.7038 0.5612 0.3901 0.4891 0.7283 0.4565 0.8502 In\u00a0[10]: Copied!
                                                                                                                                                                                  # The winner attribute calls the best model (atom.winner == atom.rf)\nprint(f\"The winner is the {atom.winner.name} model!!\")\n
                                                                                                                                                                                  # The winner attribute calls the best model (atom.winner == atom.rf) print(f\"The winner is the {atom.winner.name} model!!\")
                                                                                                                                                                                  The winner is the RF model!!\n
                                                                                                                                                                                  In\u00a0[11]: Copied!
                                                                                                                                                                                  # Visualize the distribution of predicted probabilities\natom.winner.plot_probabilities()\n
                                                                                                                                                                                  # Visualize the distribution of predicted probabilities atom.winner.plot_probabilities() In\u00a0[12]: Copied!
                                                                                                                                                                                  # Compare how different metrics perform for different thresholds\natom.winner.plot_threshold(metric=[\"f1\", \"accuracy\", \"ap\"], steps=50)\n
                                                                                                                                                                                  # Compare how different metrics perform for different thresholds atom.winner.plot_threshold(metric=[\"f1\", \"accuracy\", \"ap\"], steps=50)"}, {"location": "examples/binary_classification/#example-binary-classification", "title": "Example: Binary classification\u00b6", "text": "

                                                                                                                                                                                  This example shows how to use ATOM to solve a binary classification problem. Additonnaly, we'll perform a variety of data cleaning steps to prepare the data for modeling.

                                                                                                                                                                                  The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target RainTomorrow.

                                                                                                                                                                                  "}, {"location": "examples/binary_classification/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/binary_classification/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/binary_classification/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/calibration/", "title": "Calibration", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                  # Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                  # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                  # Load the data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n
                                                                                                                                                                                  # Load the data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\") # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0

                                                                                                                                                                                  5 rows \u00d7 22 columns

                                                                                                                                                                                  In\u00a0[3]: Copied!
                                                                                                                                                                                  atom = ATOMClassifier(X, \"RainTomorrow\", n_rows=1e4, verbose=1, warnings=False)\n\n# Apply data cleaning steps\natom.clean()\natom.impute(strat_num=\"median\", strat_cat=\"most_frequent\")\natom.encode(strategy=\"target\", max_onehot=5, infrequent_to_value=0.05)\n\n# Train a Gaussian Naive Bayes model\natom.run(\"gnb\")\n
                                                                                                                                                                                  atom = ATOMClassifier(X, \"RainTomorrow\", n_rows=1e4, verbose=1, warnings=False) # Apply data cleaning steps atom.clean() atom.impute(strat_num=\"median\", strat_cat=\"most_frequent\") atom.encode(strategy=\"target\", max_onehot=5, infrequent_to_value=0.05) # Train a Gaussian Naive Bayes model atom.run(\"gnb\")
                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (10000, 22)\nTrain set size: 8000\nTest set size: 2000\n-------------------------------------\nMemory: 1.76 MB\nScaled: False\nMissing values: 22184 (10.1%)\nCategorical features: 5 (23.8%)\n\nFitting Cleaner...\nCleaning the data...\nFitting Imputer...\nImputing missing values...\nFitting Encoder...\nEncoding categorical columns...\n\nTraining ========================= >>\nModels: GNB\nMetric: f1\n\n\nResults for GaussianNB:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.5807\nTest evaluation --> f1: 0.5971\nTime elapsed: 0.094s\n-------------------------------------------------\nTime: 0.094s\n\n\nFinal results ==================== >>\nTotal time: 0.160s\n-------------------------------------\nGaussianNB --> f1: 0.5971\n
                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                  # Check the model's calibration\natom.plot_calibration()\n
                                                                                                                                                                                  # Check the model's calibration atom.plot_calibration() In\u00a0[5]: Copied!
                                                                                                                                                                                  # Let's try to improve it using the calibrate method\natom.winner.calibrate(method=\"isotonic\", cv=5)\n
                                                                                                                                                                                  # Let's try to improve it using the calibrate method atom.winner.calibrate(method=\"isotonic\", cv=5)
                                                                                                                                                                                  Results for GaussianNB:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.5034\nTest evaluation --> f1: 0.5061\nTime elapsed: 0.282s\n
                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                  # And check again...\natom.plot_calibration()\n
                                                                                                                                                                                  # And check again... atom.plot_calibration()"}, {"location": "examples/calibration/#example-calibration", "title": "Example: Calibration\u00b6", "text": "

                                                                                                                                                                                  This example shows how to calibrate a classifier through atom.

                                                                                                                                                                                  The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target RainTomorrow.

                                                                                                                                                                                  "}, {"location": "examples/calibration/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/calibration/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/calibration/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/deep_learning/", "title": "Deep learning", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                  # Disable annoying tf warnings\nimport os\nos.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"3\"\n\nfrom tensorflow import get_logger\nget_logger().setLevel('ERROR')\n\nimport absl.logging\nabsl.logging.set_verbosity(absl.logging.ERROR)\n\nfrom atom import ATOMClassifier, ATOMModel\nfrom sklearn.preprocessing import FunctionTransformer\nfrom optuna.pruners import PatientPruner\nfrom optuna.distributions import CategoricalDistribution, IntDistribution\n\nfrom scikeras.wrappers import KerasClassifier\nfrom keras.datasets import mnist\nfrom keras.models import Sequential\nfrom keras.layers import Dense, Flatten, Conv2D, Dropout\n
                                                                                                                                                                                  # Disable annoying tf warnings import os os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"3\" from tensorflow import get_logger get_logger().setLevel('ERROR') import absl.logging absl.logging.set_verbosity(absl.logging.ERROR) from atom import ATOMClassifier, ATOMModel from sklearn.preprocessing import FunctionTransformer from optuna.pruners import PatientPruner from optuna.distributions import CategoricalDistribution, IntDistribution from scikeras.wrappers import KerasClassifier from keras.datasets import mnist from keras.models import Sequential from keras.layers import Dense, Flatten, Conv2D, Dropout In\u00a0[2]: Copied!
                                                                                                                                                                                  # Download the MNIST dataset\n(X_train, y_train), (X_test, y_test) = mnist.load_data()\n\n# Flatten data to follow sklearn's API (2d input)\nX_train = X_train.reshape(len(X_train), -1)\nX_test = X_test.reshape(len(X_test), -1)\n\ndata = (X_train, y_train), (X_test, y_test)\n
                                                                                                                                                                                  # Download the MNIST dataset (X_train, y_train), (X_test, y_test) = mnist.load_data() # Flatten data to follow sklearn's API (2d input) X_train = X_train.reshape(len(X_train), -1) X_test = X_test.reshape(len(X_test), -1) data = (X_train, y_train), (X_test, y_test) In\u00a0[3]: Copied!
                                                                                                                                                                                  # Create the convolutional neural network\nclass ConvNN(KerasClassifier):\n    \"\"\"Convolutional neural network model.\"\"\"\n\n    @property\n    def feature_encoder(self):\n        \"\"\"Convert the 2d input to the image's format (len(X), 28, 28, 1).\"\"\"\n        return FunctionTransformer(\n            func=lambda X: X.reshape(X.shape[0], 28, 28, 1),\n        )\n\n    @staticmethod\n    def _keras_build_fn(**kwargs):\n        \"\"\"Create the model's architecture.\"\"\"\n        model = Sequential()\n        model.add(\n            Conv2D(\n                filters=8,\n                kernel_size=3,\n                activation=\"relu\",\n                input_shape=(28, 28, 1),\n            )\n        )\n        model.add(Conv2D(filters=4, kernel_size=5, activation=\"relu\"))\n        model.add(Flatten())\n        model.add(Dense(units=10, activation=\"softmax\"))\n        model.compile(\n            optimizer=\"adam\",\n            loss=\"sparse_categorical_crossentropy\",\n        )\n\n        return model\n
                                                                                                                                                                                  # Create the convolutional neural network class ConvNN(KerasClassifier): \"\"\"Convolutional neural network model.\"\"\" @property def feature_encoder(self): \"\"\"Convert the 2d input to the image's format (len(X), 28, 28, 1).\"\"\" return FunctionTransformer( func=lambda X: X.reshape(X.shape[0], 28, 28, 1), ) @staticmethod def _keras_build_fn(**kwargs): \"\"\"Create the model's architecture.\"\"\" model = Sequential() model.add( Conv2D( filters=8, kernel_size=3, activation=\"relu\", input_shape=(28, 28, 1), ) ) model.add(Conv2D(filters=4, kernel_size=5, activation=\"relu\")) model.add(Flatten()) model.add(Dense(units=10, activation=\"softmax\")) model.compile( optimizer=\"adam\", loss=\"sparse_categorical_crossentropy\", ) return model In\u00a0[4]: Copied!
                                                                                                                                                                                  # Convert the model to an ATOM model\nmodel = ATOMModel(\n    estimator=ConvNN(verbose=0),\n    acronym=\"CNN\",\n    needs_scaling=True,  # Applies automated feature scaling before fitting\n    has_validation=\"epochs\",  # Applies in-training validation on parameter epochs\n)\n
                                                                                                                                                                                  # Convert the model to an ATOM model model = ATOMModel( estimator=ConvNN(verbose=0), acronym=\"CNN\", needs_scaling=True, # Applies automated feature scaling before fitting has_validation=\"epochs\", # Applies in-training validation on parameter epochs ) In\u00a0[5]: Copied!
                                                                                                                                                                                  atom = ATOMClassifier(*data, n_rows=0.1, verbose=2, random_state=1)\n
                                                                                                                                                                                  atom = ATOMClassifier(*data, n_rows=0.1, verbose=2, random_state=1)
                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Multiclass classification.\n\nDataset stats ==================== >>\nShape: (7000, 785)\nTrain set size: 6000\nTest set size: 1000\n-------------------------------------\nMemory: 5.54 MB\nScaled: False\nOutlier values: 41839 (0.9%)\n\n
                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                  # Like any other model, we can define custom distributions for hyperparameter tuning\natom.run(\n    models=model,\n    metric=\"f1_weighted\",\n    n_trials=12,\n    ht_params={\n        \"distributions\": {\n            \"epochs\": IntDistribution(2, 10),\n            \"batch_size\": CategoricalDistribution([128, 256, 512]),\n        },\n    }\n)\n
                                                                                                                                                                                  # Like any other model, we can define custom distributions for hyperparameter tuning atom.run( models=model, metric=\"f1_weighted\", n_trials=12, ht_params={ \"distributions\": { \"epochs\": IntDistribution(2, 10), \"batch_size\": CategoricalDistribution([128, 256, 512]), }, } )
                                                                                                                                                                                  \nTraining ========================= >>\nModels: CNN\nMetric: f1_weighted\n\n\nRunning hyperparameter tuning for ConvNN...\n| trial |  epochs | batch_size | f1_weighted | best_f1_weighted | time_trial | time_ht |    state |\n| ----- | ------- | ---------- | ----------- | ---------------- | ---------- | ------- | -------- |\n| 0     |       5 |        128 |      0.9147 |           0.9147 |     9.127s |  9.127s | COMPLETE |\n| 1     |       3 |        512 |      0.8539 |           0.9147 |     4.995s | 14.122s | COMPLETE |\n| 2     |       5 |        512 |      0.8931 |           0.9147 |     7.712s | 21.834s | COMPLETE |\n| 3     |       3 |        128 |       0.901 |           0.9147 |     5.706s | 27.540s | COMPLETE |\n| 4     |       5 |        128 |      0.9147 |           0.9147 |     0.607s | 28.147s | COMPLETE |\n| 5     |       9 |        128 |      0.9251 |           0.9251 |    15.297s | 43.443s | COMPLETE |\n| 6     |       9 |        128 |      0.9251 |           0.9251 |     1.230s | 44.673s | COMPLETE |\n| 7     |       3 |        128 |       0.901 |           0.9251 |     0.636s | 45.309s | COMPLETE |\n| 8     |      10 |        256 |      0.8131 |           0.9251 |     2.573s | 47.882s |   PRUNED |\n| 9     |       8 |        128 |      0.9191 |           0.9251 |    14.014s | 01m:02s |   PRUNED |\n| 10    |       7 |        256 |       0.836 |           0.9251 |     2.498s | 01m:04s |   PRUNED |\n| 11    |      10 |        128 |      0.9431 |           0.9431 |    16.725s | 01m:21s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 11\nBest parameters:\n --> epochs: 10\n --> batch_size: 128\nBest evaluation --> f1_weighted: 0.9431\nTime elapsed: 01m:21s\nFit ---------------------------------------------\nTrain evaluation --> f1_weighted: 
0.9835\nTest evaluation --> f1_weighted: 0.952\nTime elapsed: 28.600s\n-------------------------------------------------\nTime: 01m:50s\n\n\nFinal results ==================== >>\nTotal time: 03m:39s\n-------------------------------------\nConvNN --> f1_weighted: 0.952\n
                                                                                                                                                                                  In\u00a0[7]: Copied!
                                                                                                                                                                                  atom.cnn.trials\n
                                                                                                                                                                                  atom.cnn.trials Out[7]: epochs batch_size estimator f1_weighted best_f1_weighted time_trial time_ht state trial 0 5 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.914748 0.943121 9.126504 9.126504 COMPLETE 1 3 512 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.853919 0.943121 4.995052 14.121556 COMPLETE 2 5 512 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.893128 0.943121 7.712461 21.834017 COMPLETE 3 3 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.900996 0.943121 5.705581 27.539598 COMPLETE 4 5 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.914748 0.943121 0.607057 28.146655 COMPLETE 5 9 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.925088 0.943121 15.296670 43.443325 COMPLETE 6 9 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.925088 0.943121 1.229779 44.673104 COMPLETE 7 3 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.900996 0.943121 0.635578 45.308682 COMPLETE 8 10 256 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.813073 0.943121 2.573343 47.882025 PRUNED 9 8 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.919095 0.943121 14.014060 61.896085 PRUNED 10 7 256 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.835966 0.943121 2.498169 64.394254 PRUNED 11 10 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.943121 0.943121 16.725048 81.119302 COMPLETE In\u00a0[8]: Copied!
                                                                                                                                                                                  atom.plot_evals(dataset=\"test+train\")\n
                                                                                                                                                                                  atom.plot_evals(dataset=\"test+train\") In\u00a0[9]: Copied!
                                                                                                                                                                                  # Use the prediction methods like any other model\natom.cnn.predict_proba(X_train)\n
                                                                                                                                                                                  # Use the prediction methods like any other model atom.cnn.predict_proba(X_train) Out[9]: 0 1 2 3 4 5 6 7 8 9 0 6.981344e-08 1.163047e-08 1.302092e-07 7.298404e-01 4.980663e-11 2.701415e-01 6.764501e-11 1.982446e-06 5.807213e-07 1.532895e-05 1 9.999958e-01 2.160013e-12 2.527803e-06 1.498349e-07 2.094386e-09 4.418725e-07 6.460270e-07 2.255171e-07 2.042284e-08 7.188346e-08 2 1.154879e-10 2.405690e-10 1.185454e-07 3.165163e-07 9.995613e-01 1.887145e-11 6.159564e-12 4.155245e-04 1.546579e-09 2.274483e-05 3 5.565947e-07 9.992028e-01 6.758810e-04 3.334095e-06 2.312364e-05 9.298934e-08 1.309337e-07 7.859311e-05 1.515798e-05 3.681653e-07 4 4.683458e-09 4.092270e-08 3.246872e-07 1.020155e-06 2.804452e-03 9.423515e-08 3.789635e-12 8.406813e-03 7.883451e-05 9.887084e-01 ... ... ... ... ... ... ... ... ... ... ... 59995 7.329114e-09 4.127999e-08 3.695257e-06 1.461548e-04 1.231008e-09 6.157245e-06 2.624072e-11 8.209722e-09 9.998319e-01 1.199038e-05 59996 6.239399e-08 2.397851e-09 1.575265e-03 9.643788e-01 8.514269e-08 1.101398e-04 1.774388e-10 1.135693e-07 3.362476e-02 3.106496e-04 59997 7.059591e-10 5.808693e-09 1.657147e-11 3.829917e-05 3.490374e-07 9.998387e-01 4.054391e-11 4.646493e-11 1.087904e-04 1.385001e-05 59998 1.183419e-05 2.104532e-09 1.940764e-06 1.050059e-07 8.195059e-06 5.124656e-06 9.999721e-01 4.185512e-09 7.723169e-07 1.096977e-09 59999 3.987676e-04 1.140556e-06 4.448286e-04 4.279935e-06 1.410985e-07 2.539659e-03 8.256741e-08 8.921248e-08 9.958331e-01 7.779775e-04

                                                                                                                                                                                  60000 rows \u00d7 10 columns

                                                                                                                                                                                  In\u00a0[10]: Copied!
                                                                                                                                                                                  # Or make plots...\natom.cnn.plot_hyperparameters()\n
                                                                                                                                                                                  # Or make plots... atom.cnn.plot_hyperparameters() In\u00a0[11]: Copied!
                                                                                                                                                                                  atom.plot_parallel_coordinate()\n
                                                                                                                                                                                  atom.plot_parallel_coordinate()"}, {"location": "examples/deep_learning/#example-deep-learning", "title": "Example: Deep learning\u00b6", "text": "

                                                                                                                                                                                  This example shows how to use ATOM to train and validate a Convolutional Neural Network implemented with Keras using scikeras.

                                                                                                                                                                                  Import the MNIST dataset from keras.datasets. This is a well known image dataset whose goal is to classify handwritten digits.

                                                                                                                                                                                  "}, {"location": "examples/deep_learning/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/deep_learning/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/deep_learning/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/ensembles/", "title": "Ensembles", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                  # Import packages\nfrom sklearn.datasets import load_breast_cancer\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                  # Import packages from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                  # Load the data\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n
                                                                                                                                                                                  # Load the data X, y = load_breast_cancer(return_X_y=True, as_frame=True) In\u00a0[3]: Copied!
                                                                                                                                                                                  # Initialize atom and train several models\natom = ATOMClassifier(X, y, verbose=2, random_state=1)\natom.run(models=[\"LR\", \"Tree\", \"LGB\"], metric=\"accuracy\")\n
                                                                                                                                                                                  # Initialize atom and train several models atom = ATOMClassifier(X, y, verbose=2, random_state=1) atom.run(models=[\"LR\", \"Tree\", \"LGB\"], metric=\"accuracy\")
                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 138.97 kB\nScaled: False\nOutlier values: 167 (1.2%)\n\n\nTraining ========================= >>\nModels: LR, Tree, LGB\nMetric: accuracy\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> accuracy: 0.989\nTest evaluation --> accuracy: 0.9823\nTime elapsed: 0.048s\n-------------------------------------------------\nTime: 0.048s\n\n\nResults for DecisionTree:\nFit ---------------------------------------------\nTrain evaluation --> accuracy: 1.0\nTest evaluation --> accuracy: 0.9469\nTime elapsed: 0.042s\n-------------------------------------------------\nTime: 0.042s\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --> accuracy: 1.0\nTest evaluation --> accuracy: 0.9469\nTime elapsed: 0.246s\n-------------------------------------------------\nTime: 0.246s\n\n\nFinal results ==================== >>\nTotal time: 0.419s\n-------------------------------------\nLogisticRegression --> accuracy: 0.9823 !\nDecisionTree       --> accuracy: 0.9469\nLightGBM           --> accuracy: 0.9469\n
                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                  # Combine the models into a Voting model\natom.voting(voting=\"soft\")\n
                                                                                                                                                                                  # Combine the models into a Voting model atom.voting(voting=\"soft\")
                                                                                                                                                                                  Results for Voting:\nFit ---------------------------------------------\nTrain evaluation --> accuracy: 1.0\nTest evaluation --> accuracy: 0.9469\nTime elapsed: 0.055s\n
                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                  # Note that we now have an extra model in the pipeline\natom.models\n
                                                                                                                                                                                  # Note that we now have an extra model in the pipeline atom.models Out[5]:
                                                                                                                                                                                  ['LR', 'Tree', 'LGB', 'Vote']
                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                  # The plot_pipeline method helps us visualize the ensemble\natom.plot_pipeline()\n
                                                                                                                                                                                  # The plot_pipeline method helps us visualize the ensemble atom.plot_pipeline() In\u00a0[7]: Copied!
                                                                                                                                                                                  # The Vote model averages the scores of the models it contains\natom.vote\n
                                                                                                                                                                                  # The Vote model averages the scores of the models it contains atom.vote Out[7]:
                                                                                                                                                                                  Voting()
                                                                                                                                                                                  In\u00a0[8]: Copied!
                                                                                                                                                                                  # We can use it like any other model to make predictions or plots\natom.vote.predict_proba(range(10))\n
                                                                                                                                                                                  # We can use it like any other model to make predictions or plots atom.vote.predict_proba(range(10)) Out[8]: 0 1 0 0.961516 0.038484 1 0.999968 0.000032 2 0.998743 0.001257 3 0.968071 0.031929 4 0.000014 0.999986 5 0.999991 0.000009 6 0.000019 0.999981 7 0.000015 0.999985 8 0.000026 0.999974 9 0.002627 0.997373 In\u00a0[9]: Copied!
                                                                                                                                                                                  atom.vote.plot_threshold(metric=[\"auc\", \"recall\", \"accuracy\"])\n
                                                                                                                                                                                  atom.vote.plot_threshold(metric=[\"auc\", \"recall\", \"accuracy\"]) In\u00a0[10]: Copied!
                                                                                                                                                                                  atom.plot_results(legend=None)\n
                                                                                                                                                                                  atom.plot_results(legend=None) In\u00a0[11]: Copied!
                                                                                                                                                                                  atom.delete(\"vote\")\n
                                                                                                                                                                                  atom.delete(\"vote\")
                                                                                                                                                                                  Deleting 1 models...\n --> Model Vote successfully deleted.\n
                                                                                                                                                                                  In\u00a0[12]: Copied!
                                                                                                                                                                                  # Just like Voting, we can create a Stacking model\natom.stacking(final_estimator=\"LDA\")\n
                                                                                                                                                                                  # Just like Voting, we can create a Stacking model atom.stacking(final_estimator=\"LDA\")
                                                                                                                                                                                  Results for Stacking:\nFit ---------------------------------------------\nTrain evaluation --> accuracy: 0.9934\nTest evaluation --> accuracy: 0.9823\nTime elapsed: 0.728s\n
                                                                                                                                                                                  In\u00a0[13]: Copied!
                                                                                                                                                                                  # The final estimator uses the predictions of the underlying models\natom.stack.head()\n
                                                                                                                                                                                  # The final estimator uses the predictions of the underlying models atom.stack.head() Out[13]: mean radius mean texture mean perimeter mean area mean smoothness mean compactness mean concavity mean concave points mean symmetry mean fractal dimension ... worst texture worst perimeter worst area worst smoothness worst compactness worst concavity worst concave points worst symmetry worst fractal dimension target 0 13.48 20.82 88.40 559.2 0.10160 0.12550 0.10630 0.05439 0.1720 0.06419 ... 26.02 107.30 740.4 0.1610 0.42250 0.5030 0.22580 0.2807 0.10710 0 1 18.31 20.58 120.80 1052.0 0.10680 0.12480 0.15690 0.09451 0.1860 0.05941 ... 26.20 142.20 1493.0 0.1492 0.25360 0.3759 0.15100 0.3074 0.07863 0 2 17.93 24.48 115.20 998.9 0.08855 0.07027 0.05699 0.04744 0.1538 0.05510 ... 34.69 135.10 1320.0 0.1315 0.18060 0.2080 0.11360 0.2504 0.07948 0 3 15.13 29.81 96.71 719.5 0.08320 0.04605 0.04686 0.02739 0.1852 0.05294 ... 36.91 110.10 931.4 0.1148 0.09866 0.1547 0.06575 0.3233 0.06165 0 4 8.95 15.76 58.74 245.2 0.09462 0.12430 0.09263 0.02308 0.1305 0.07163 ... 17.07 63.34 270.0 0.1179 0.18790 0.1544 0.03846 0.1652 0.07722 1

                                                                                                                                                                                  5 rows \u00d7 31 columns

                                                                                                                                                                                  In\u00a0[14]: Copied!
                                                                                                                                                                                  # Again, the model can be used for predictions or plots\natom.stack.predict(X)\n
                                                                                                                                                                                  # Again, the model can be used for predictions or plots atom.stack.predict(X) Out[14]:
                                                                                                                                                                                  0      0\n1      0\n2      0\n3      0\n4      1\n      ..\n564    1\n565    0\n566    0\n567    0\n568    1\nName: target, Length: 569, dtype: int64
                                                                                                                                                                                  In\u00a0[15]: Copied!
                                                                                                                                                                                  atom.stack.plot_shap_beeswarm(show=10)\n
                                                                                                                                                                                  atom.stack.plot_shap_beeswarm(show=10)
                                                                                                                                                                                  PermutationExplainer explainer: 114it [00:48,  2.01it/s]                                                                                                                                                                                                                                                             \n
                                                                                                                                                                                  "}, {"location": "examples/ensembles/#example-ensembles", "title": "Example: Ensembles\u00b6", "text": "

                                                                                                                                                                                  This example shows how to use atom's ensemble techniques to improve predictions on a dataset combining several models.

                                                                                                                                                                                  Import the breast cancer dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.

                                                                                                                                                                                  "}, {"location": "examples/ensembles/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/ensembles/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/ensembles/#voting", "title": "Voting\u00b6", "text": ""}, {"location": "examples/ensembles/#stacking", "title": "Stacking\u00b6", "text": ""}, {"location": "examples/feature_engineering/", "title": "Feature engineering", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                  # Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                  # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                  # Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n
                                                                                                                                                                                  # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\") # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0

                                                                                                                                                                                  5 rows \u00d7 22 columns

                                                                                                                                                                                  In\u00a0[3]: Copied!
                                                                                                                                                                                  # Initialize atom and apply data cleaning\natom = ATOMClassifier(X, n_rows=1e4, test_size=0.2, verbose=0)\natom.impute(strat_num=\"knn\", strat_cat=\"remove\", max_nan_rows=0.8)\natom.encode(max_onehot=10, infrequent_to_value=0.04)\n
                                                                                                                                                                                  # Initialize atom and apply data cleaning atom = ATOMClassifier(X, n_rows=1e4, test_size=0.2, verbose=0) atom.impute(strat_num=\"knn\", strat_cat=\"remove\", max_nan_rows=0.8) atom.encode(max_onehot=10, infrequent_to_value=0.04) In\u00a0[4]: Copied!
                                                                                                                                                                                  atom.verbose = 2  # Increase verbosity to see the output\n\n# Let's see how a LightGBM model performs\natom.run('LGB', metric='auc')\n
                                                                                                                                                                                  atom.verbose = 2 # Increase verbosity to see the output # Let's see how a LightGBM model performs atom.run('LGB', metric='auc')
                                                                                                                                                                                  \nTraining ========================= >>\nModels: LGB\nMetric: auc\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --> auc: 0.9817\nTest evaluation --> auc: 0.8584\nTime elapsed: 0.831s\n-------------------------------------------------\nTime: 0.831s\n\n\nFinal results ==================== >>\nTotal time: 0.963s\n-------------------------------------\nLightGBM --> auc: 0.8584\n
                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                  # Since we are going to compare different datasets,\n# we need to create separate branches\natom.branch = \"dfs\"\n
                                                                                                                                                                                  # Since we are going to compare different datasets, # we need to create separate branches atom.branch = \"dfs\"
                                                                                                                                                                                  Successfully created new branch: dfs.\n
                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                  # Create 50 new features using dfs\natom.feature_generation(\"dfs\", n_features=50, operators=[\"add\", \"sub\", \"log\"])\n
                                                                                                                                                                                  # Create 50 new features using dfs atom.feature_generation(\"dfs\", n_features=50, operators=[\"add\", \"sub\", \"log\"])
                                                                                                                                                                                  Fitting FeatureGenerator...\nGenerating new features...\n --> 50 new features were added.\n
                                                                                                                                                                                  In\u00a0[7]: Copied!
                                                                                                                                                                                  # The warnings warn us that some operators created missing values!\n# We can see the columns with missing values using the nans attribute\natom.nans\n
                                                                                                                                                                                  # The warnings warn us that some operators created missing values! # We can see the columns with missing values using the nans attribute atom.nans Out[7]:
                                                                                                                                                                                  Location                       0\nMinTemp                        0\nMaxTemp                        0\nRainfall                       0\nEvaporation                    0\n                              ..\nTemp9am - WindDir3pm           0\nWindDir9am + WindGustSpeed     0\nWindDir9am + WindSpeed3pm      0\nWindGustDir + WindSpeed9am     0\nWindSpeed3pm - WindSpeed9am    0\nLength: 73, dtype: int64
                                                                                                                                                                                  In\u00a0[8]: Copied!
                                                                                                                                                                                  # Turn off warnings in the future\natom.warnings = False\n\n# Impute the data again to get rid of the missing values\natom.impute(strat_num=\"knn\", strat_cat=\"remove\", max_nan_rows=0.8)\n
                                                                                                                                                                                  # Turn off warnings in the future atom.warnings = False # Impute the data again to get rid of the missing values atom.impute(strat_num=\"knn\", strat_cat=\"remove\", max_nan_rows=0.8)
                                                                                                                                                                                  Fitting Imputer...\nImputing missing values...\n --> Imputing 12 missing values using the KNN imputer in feature NATURAL_LOGARITHM(Temp3pm).\n
                                                                                                                                                                                  In\u00a0[9]: Copied!
                                                                                                                                                                                  # 50 new features may be to much...\n# Let's check for multicollinearity and use rfecv to reduce the number\natom.feature_selection(\n    strategy=\"rfecv\",\n    solver=\"LGB\",\n    n_features=30,\n    scoring=\"auc\",\n    max_correlation=0.98,\n)\n
                                                                                                                                                                                  # 50 new features may be to much... # Let's check for multicollinearity and use rfecv to reduce the number atom.feature_selection( strategy=\"rfecv\", solver=\"LGB\", n_features=30, scoring=\"auc\", max_correlation=0.98, )
                                                                                                                                                                                  Fitting FeatureSelector...\nPerforming feature selection ...\n --> Feature MinTemp was removed due to collinearity with another feature.\n --> Feature MinTemp + RainToday_No was removed due to collinearity with another feature.\n --> Feature MaxTemp was removed due to collinearity with another feature.\n --> Feature MaxTemp + WindDir3pm was removed due to collinearity with another feature.\n --> Feature MaxTemp + WindGustDir was removed due to collinearity with another feature.\n --> Feature Rainfall was removed due to collinearity with another feature.\n --> Feature Rainfall + RainToday_rare was removed due to collinearity with another feature.\n --> Feature Rainfall + WindDir3pm was removed due to collinearity with another feature.\n --> Feature Sunshine was removed due to collinearity with another feature.\n --> Feature Sunshine - WindDir3pm was removed due to collinearity with another feature.\n --> Feature WindGustSpeed was removed due to collinearity with another feature.\n --> Feature WindSpeed9am was removed due to collinearity with another feature.\n --> Feature WindSpeed3pm was removed due to collinearity with another feature.\n --> Feature Humidity9am was removed due to collinearity with another feature.\n --> Feature Humidity3pm was removed due to collinearity with another feature.\n --> Feature NATURAL_LOGARITHM(Pressure3pm) was removed due to collinearity with another feature.\n --> Feature Pressure3pm - RainToday_Yes was removed due to collinearity with another feature.\n --> Feature Cloud9am + RainToday_No was removed due to collinearity with another feature.\n --> Feature Cloud3pm was removed due to collinearity with another feature.\n --> Feature Cloud3pm + Location was removed due to collinearity with another feature.\n --> Feature Temp9am - WindDir3pm was removed due to 
collinearity with another feature.\n --> Feature Temp3pm was removed due to collinearity with another feature.\n --> Feature Temp3pm - WindDir9am was removed due to collinearity with another feature.\n --> Feature RainToday_rare was removed due to collinearity with another feature.\n --> rfecv selected 38 features from the dataset.\n   --> Dropping feature Location (rank 12).\n   --> Dropping feature Cloud9am (rank 2).\n   --> Dropping feature RainToday_No (rank 10).\n   --> Dropping feature RainToday_Yes (rank 11).\n   --> Dropping feature Location + RainToday_rare (rank 9).\n   --> Dropping feature Location - Pressure9am (rank 4).\n   --> Dropping feature Location - Temp9am (rank 7).\n   --> Dropping feature Location - WindGustDir (rank 8).\n   --> Dropping feature RainToday_No - WindSpeed3pm (rank 3).\n   --> Dropping feature RainToday_rare + Temp3pm (rank 5).\n   --> Dropping feature Rainfall + RainToday_Yes (rank 6).\n
                                                                                                                                                                                  In\u00a0[10]: Copied!
                                                                                                                                                                                  # The collinear attribute shows what features were removed due to multicollinearity\natom.collinear_\n
                                                                                                                                                                                  # The collinear attribute shows what features were removed due to multicollinearity atom.collinear_ Out[10]: drop corr_feature corr_value 0 MinTemp MinTemp + RainToday_No, MinTemp + RainToday_Yes 0.9978, 0.9979 1 MinTemp + RainToday_No MinTemp, MinTemp + RainToday_Yes 0.9978, 0.9914 2 MaxTemp MaxTemp + WindDir3pm, MaxTemp + WindDir9am, Ma... 1.0, 1.0, 1.0 3 MaxTemp + WindDir3pm MaxTemp, MaxTemp + WindDir9am, MaxTemp + WindG... 1.0, 1.0, 1.0 4 MaxTemp + WindGustDir MaxTemp, MaxTemp + WindDir3pm, MaxTemp + WindD... 1.0, 1.0, 1.0 5 Rainfall Rainfall + RainToday_Yes, Rainfall + RainToday... 0.999, 0.9999, 1.0 6 Rainfall + RainToday_rare Rainfall, Rainfall + RainToday_Yes, Rainfall +... 0.9999, 0.9989, 0.9999 7 Rainfall + WindDir3pm Rainfall, Rainfall + RainToday_Yes, Rainfall +... 1.0, 0.999, 0.9999 8 Sunshine RainToday_rare + Sunshine, Sunshine - WindDir3pm 0.9994, 0.9998 9 Sunshine - WindDir3pm Sunshine, RainToday_rare + Sunshine 0.9998, 0.9993 10 WindGustSpeed WindDir9am + WindGustSpeed 1.0 11 WindSpeed9am WindGustDir + WindSpeed9am 1.0 12 WindSpeed3pm WindDir9am + WindSpeed3pm 1.0 13 Humidity9am Humidity9am + WindGustDir 1.0 14 Humidity3pm Humidity3pm - Sunshine 0.9937 15 NATURAL_LOGARITHM(Pressure3pm) Pressure3pm, Pressure3pm - RainToday_Yes 1.0, 0.9981 16 Pressure3pm - RainToday_Yes Pressure3pm, NATURAL_LOGARITHM(Pressure3pm) 0.9981, 0.9981 17 Cloud9am + RainToday_No Cloud9am 0.9828 18 Cloud3pm Cloud3pm + Location, Cloud3pm + RainToday_rare 1.0, 0.9991 19 Cloud3pm + Location Cloud3pm, Cloud3pm + RainToday_rare 1.0, 0.9991 20 Temp9am - WindDir3pm Temp9am 1.0 21 Temp3pm RainToday_rare + Temp3pm, Temp3pm - WindDir9am 0.9999, 1.0 22 Temp3pm - WindDir9am Temp3pm, RainToday_rare + Temp3pm 1.0, 0.9999 23 RainToday_rare Location + RainToday_rare 1.0 In\u00a0[11]: Copied!
                                                                                                                                                                                  # After applying rfecv, we can plot the score per number of features\natom.plot_rfecv()\n
                                                                                                                                                                                  # After applying rfecv, we can plot the score per number of features atom.plot_rfecv() In\u00a0[12]: Copied!
                                                                                                                                                                                  # Let's see how the model performs now\n# Add a tag to the model's acronym to not overwrite previous LGB\natom.run(\"LGB_dfs\", errors=\"raise\")\n
                                                                                                                                                                                  # Let's see how the model performs now # Add a tag to the model's acronym to not overwrite previous LGB atom.run(\"LGB_dfs\", errors=\"raise\")
                                                                                                                                                                                  \nTraining ========================= >>\nModels: LGB_dfs\nMetric: auc\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --> auc: 0.9893\nTest evaluation --> auc: 0.8572\nTime elapsed: 1.045s\n-------------------------------------------------\nTime: 1.045s\n\n\nFinal results ==================== >>\nTotal time: 1.186s\n-------------------------------------\nLightGBM --> auc: 0.8572\n
                                                                                                                                                                                  In\u00a0[13]: Copied!
                                                                                                                                                                                  # Create another branch for the genetic features\n# Split from main to avoid the dfs features\natom.branch = \"gfg_from_main\"\n
                                                                                                                                                                                  # Create another branch for the genetic features # Split from main to avoid the dfs features atom.branch = \"gfg_from_main\"
                                                                                                                                                                                  Successfully created new branch: gfg.\n
                                                                                                                                                                                  In\u00a0[14]: Copied!
                                                                                                                                                                                  # Create new features using Genetic Programming\natom.feature_generation(strategy='gfg', n_features=20)\n
                                                                                                                                                                                  # Create new features using Genetic Programming atom.feature_generation(strategy='gfg', n_features=20)
                                                                                                                                                                                  Fitting FeatureGenerator...\n    |   Population Average    |             Best Individual              |\n---- ------------------------- ------------------------------------------ ----------\n Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left\n   0     3.08         0.137852        3         0.505879              N/A     18.62s\n   1     3.30         0.332951        6         0.506041              N/A     19.23s\n   2     3.92         0.429317        7         0.525775              N/A     18.31s\n   3     4.64         0.459817        9         0.532823              N/A     16.25s\n   4     6.59         0.475058       11         0.540078              N/A     15.51s\n   5     8.04         0.498345       13          0.54114              N/A     14.56s\n   6     9.80         0.509423       13         0.543911              N/A     13.87s\n   7    10.86         0.513225       15         0.551242              N/A     13.28s\n   8    11.54         0.513973       15         0.554127              N/A     11.99s\n   9    12.21         0.516725       19         0.554172              N/A     11.44s\n  10    13.09         0.520543       17         0.556923              N/A     10.19s\n  11    13.24         0.519283       17         0.556923              N/A      9.07s\n  12    12.74          0.51949       21         0.558114              N/A      7.95s\n  13    13.88         0.521709       21         0.558114              N/A      6.68s\n  14    15.99         0.523381       19         0.558673              N/A      6.12s\n  15    16.74         0.523708       19         0.558673              N/A      7.97s\n  16    16.84         0.524509       19         0.560449              N/A      6.02s\n  17    16.79         0.525061       19         0.560449              N/A      
2.26s\n  18    16.77         0.523639       21         0.561281              N/A      1.11s\n  19    17.03         0.524261       23         0.561813              N/A      0.00s\nGenerating new features...\n --> 20 new features were added.\n
                                                                                                                                                                                  In\u00a0[16]: Copied!
                                                                                                                                                                                  # We can see the feature's fitness and description through the genetic_features attribute\natom.genetic_features_\n
                                                                                                                                                                                  # We can see the feature's fitness and description through the genetic_features attribute atom.genetic_features_ Out[16]: name description fitness 0 x23 mul(add(WindGustSpeed, Humidity3pm), mul(add(C... 0.541449 1 x24 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.541449 2 x25 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.541449 3 x26 mul(add(WindGustSpeed, Humidity3pm), mul(add(C... 0.541449 4 x27 mul(add(WindGustSpeed, Humidity3pm), mul(add(C... 0.541449 5 x28 mul(add(Cloud3pm, add(Cloud3pm, mul(add(WindGu... 0.541322 6 x29 mul(add(Cloud3pm, mul(Humidity3pm, WindDir3pm)... 0.541229 7 x30 mul(add(Cloud3pm, mul(Humidity3pm, WindDir3pm)... 0.541229 8 x31 mul(add(Cloud3pm, mul(Humidity3pm, WindDir3pm)... 0.540696 9 x32 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.540674 10 x33 mul(add(WindGustSpeed, Humidity3pm), mul(add(C... 0.540674 11 x34 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.540674 12 x35 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.540281 13 x36 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.540281 14 x37 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.539923 15 x38 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.539923 16 x39 mul(add(WindGustSpeed, add(Humidity3pm, Rainfa... 0.539923 17 x40 mul(add(WindGustSpeed, Humidity3pm), mul(add(C... 0.539923 18 x41 mul(mul(add(Cloud3pm, add(Cloud3pm, mul(Humidi... 0.539923 19 x42 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.539909 In\u00a0[17]: Copied!
                                                                                                                                                                                  # Fit the model again\natom.run(\"LGB_gfg\", metric=\"auc\")\n
                                                                                                                                                                                  # Fit the model again atom.run(\"LGB_gfg\", metric=\"auc\")
                                                                                                                                                                                  \nTraining ========================= >>\nModels: LGB_gfg\nMetric: auc\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --> auc: 0.9857\nTest evaluation --> auc: 0.8558\nTime elapsed: 1.044s\n-------------------------------------------------\nTime: 1.044s\n\n\nFinal results ==================== >>\nTotal time: 1.227s\n-------------------------------------\nLightGBM --> auc: 0.8558\n
                                                                                                                                                                                  In\u00a0[18]: Copied!
                                                                                                                                                                                  # Visualize the whole pipeline\natom.plot_pipeline()\n
                                                                                                                                                                                  # Visualize the whole pipeline atom.plot_pipeline() In\u00a0[19]: Copied!
                                                                                                                                                                                  # Use atom's plots to compare the three models\natom.plot_roc(rows=\"test+train\")\n
                                                                                                                                                                                  # Use atom's plots to compare the three models atom.plot_roc(rows=\"test+train\") In\u00a0[23]: Copied!
                                                                                                                                                                                  # To compare other plots it might be useful to use a canvas\nwith atom.canvas(1, 2, figsize=(1800, 800)):\n    atom.lgb_dfs.plot_roc(rows=\"test+train\")\n    atom.lgb_dfs.plot_feature_importance(show=10, title=\"LGB + dfs\")\n
                                                                                                                                                                                  # To compare other plots it might be useful to use a canvas with atom.canvas(1, 2, figsize=(1800, 800)): atom.lgb_dfs.plot_roc(rows=\"test+train\") atom.lgb_dfs.plot_feature_importance(show=10, title=\"LGB + dfs\") In\u00a0[21]: Copied!
                                                                                                                                                                                  # We can check the feature importance with other plots as well\natom.plot_permutation_importance(models=[\"LGB_dfs\", \"LGB_gfg\"], show=12)\n
                                                                                                                                                                                  # We can check the feature importance with other plots as well atom.plot_permutation_importance(models=[\"LGB_dfs\", \"LGB_gfg\"], show=12) In\u00a0[24]: Copied!
                                                                                                                                                                                  atom.LGB_gfg.plot_shap_decision(rows=(0, 10), show=15)\n
                                                                                                                                                                                  atom.LGB_gfg.plot_shap_decision(rows=(0, 10), show=15)"}, {"location": "examples/feature_engineering/#example-feature-engineering", "title": "Example: Feature engineering\u00b6", "text": "

                                                                                                                                                                                  This example shows how to use automated feature generation to improve a model's performance.

                                                                                                                                                                                  The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target RainTomorrow.

                                                                                                                                                                                  "}, {"location": "examples/feature_engineering/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/feature_engineering/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/feature_engineering/#deep-feature-synthesis", "title": "Deep Feature Synthesis\u00b6", "text": ""}, {"location": "examples/feature_engineering/#genetic-feature-generation", "title": "Genetic Feature Generation\u00b6", "text": ""}, {"location": "examples/feature_engineering/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/getting_started/", "title": "Getting started", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                  import pandas as pd\nfrom atom import ATOMClassifier\n\n# Load the Australian Weather dataset\nX = pd.read_csv(\"https://raw.githubusercontent.com/tvdboom/ATOM/master/examples/datasets/weatherAUS.csv\")\n
                                                                                                                                                                                  import pandas as pd from atom import ATOMClassifier # Load the Australian Weather dataset X = pd.read_csv(\"https://raw.githubusercontent.com/tvdboom/ATOM/master/examples/datasets/weatherAUS.csv\") In\u00a0[2]: Copied!
                                                                                                                                                                                  atom = ATOMClassifier(X, y=\"RainTomorrow\", n_rows=1000, verbose=2)\n
                                                                                                                                                                                  atom = ATOMClassifier(X, y=\"RainTomorrow\", n_rows=1000, verbose=2)
                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (1000, 22)\nTrain set size: 800\nTest set size: 200\n-------------------------------------\nMemory: 176.13 kB\nScaled: False\nMissing values: 2260 (10.3%)\nCategorical features: 5 (23.8%)\n\n
                                                                                                                                                                                  In\u00a0[3]: Copied!
                                                                                                                                                                                  atom.impute(strat_num=\"median\", strat_cat=\"most_frequent\")  \natom.encode(strategy=\"Target\", max_onehot=8)\n
                                                                                                                                                                                  atom.impute(strat_num=\"median\", strat_cat=\"most_frequent\") atom.encode(strategy=\"Target\", max_onehot=8)
                                                                                                                                                                                  Fitting Imputer...\nImputing missing values...\n --> Imputing 8 missing values with median (11.6) in feature MinTemp.\n --> Imputing 2 missing values with median (22.3) in feature MaxTemp.\n --> Imputing 12 missing values with median (0.0) in feature Rainfall.\n --> Imputing 425 missing values with median (4.8) in feature Evaporation.\n --> Imputing 480 missing values with median (8.55) in feature Sunshine.\n --> Imputing 59 missing values with most_frequent (N) in feature WindGustDir.\n --> Imputing 59 missing values with median (37.0) in feature WindGustSpeed.\n --> Imputing 90 missing values with most_frequent (N) in feature WindDir9am.\n --> Imputing 28 missing values with most_frequent (SW) in feature WindDir3pm.\n --> Imputing 10 missing values with median (13.0) in feature WindSpeed9am.\n --> Imputing 19 missing values with median (17.0) in feature WindSpeed3pm.\n --> Imputing 17 missing values with median (70.0) in feature Humidity9am.\n --> Imputing 31 missing values with median (51.0) in feature Humidity3pm.\n --> Imputing 89 missing values with median (1017.8) in feature Pressure9am.\n --> Imputing 87 missing values with median (1015.2) in feature Pressure3pm.\n --> Imputing 383 missing values with median (5.0) in feature Cloud9am.\n --> Imputing 412 missing values with median (5.0) in feature Cloud3pm.\n --> Imputing 11 missing values with median (16.5) in feature Temp9am.\n --> Imputing 26 missing values with median (20.7) in feature Temp3pm.\n --> Imputing 12 missing values with most_frequent (No) in feature RainToday.\nFitting Encoder...\nEncoding categorical columns...\n --> Target-encoding feature Location. Contains 49 classes.\n --> Target-encoding feature WindGustDir. Contains 16 classes.\n --> Target-encoding feature WindDir9am. 
Contains 16 classes.\n --> Target-encoding feature WindDir3pm. Contains 16 classes.\n --> Ordinal-encoding feature RainToday. Contains 2 classes.\n
                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                  atom.run(models=[\"LDA\", \"AdaB\"], metric=\"auc\", n_trials=10)\n
                                                                                                                                                                                  atom.run(models=[\"LDA\", \"AdaB\"], metric=\"auc\", n_trials=10)
                                                                                                                                                                                  \nTraining ========================= >>\nModels: LDA, AdaB\nMetric: auc\n\n\nRunning hyperparameter tuning for LinearDiscriminantAnalysis...\n| trial |  solver | shrinkage |     auc | best_auc | time_trial | time_ht |    state |\n| ----- | ------- | --------- | ------- | -------- | ---------- | ------- | -------- |\n| 0     |   eigen |       0.9 |  0.8807 |   0.8807 |     0.162s |  0.162s | COMPLETE |\n| 1     |     svd |       nan |  0.8445 |   0.8807 |     0.147s |  0.309s | COMPLETE |\n| 2     |     svd |       nan |  0.8445 |   0.8807 |     0.001s |  0.310s | COMPLETE |\n| 3     |     svd |       nan |  0.8445 |   0.8807 |     0.001s |  0.311s | COMPLETE |\n| 4     |     svd |       nan |  0.8445 |   0.8807 |     0.001s |  0.312s | COMPLETE |\n| 5     |   eigen |       0.9 |  0.8807 |   0.8807 |     0.000s |  0.312s | COMPLETE |\n| 6     |     svd |       nan |  0.8445 |   0.8807 |     0.000s |  0.312s | COMPLETE |\n| 7     |     svd |       nan |  0.8445 |   0.8807 |     0.001s |  0.313s | COMPLETE |\n| 8     |   eigen |       0.5 |  0.8417 |   0.8807 |     0.143s |  0.456s | COMPLETE |\n| 9     |     svd |       nan |  0.8445 |   0.8807 |     0.001s |  0.457s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 0\nBest parameters:\n --> solver: eigen\n --> shrinkage: 0.9\nBest evaluation --> auc: 0.8807\nTime elapsed: 0.457s\nFit ---------------------------------------------\nTrain evaluation --> auc: 0.8381\nTest evaluation --> auc: 0.8037\nTime elapsed: 0.025s\n-------------------------------------------------\nTime: 0.482s\n\n\nRunning hyperparameter tuning for AdaBoost...\n| trial | n_estimators | learning_rate | algorithm |     auc | best_auc | time_trial | time_ht |    state |\n| ----- | ------------ | ------------- | --------- | ------- | -------- | 
---------- | ------- | -------- |\n| 0     |           90 |        0.4088 |   SAMME.R |  0.8002 |   0.8002 |     0.331s |  0.331s | COMPLETE |\n| 1     |          190 |        0.1019 |   SAMME.R |  0.8294 |   0.8294 |     0.540s |  0.871s | COMPLETE |\n| 2     |          260 |         0.243 |   SAMME.R |   0.754 |   0.8294 |     0.645s |  1.515s | COMPLETE |\n| 3     |          490 |         0.041 |   SAMME.R |  0.7953 |   0.8294 |     1.105s |  2.620s | COMPLETE |\n| 4     |          210 |        0.1604 |     SAMME |  0.7969 |   0.8294 |     0.527s |  3.148s | COMPLETE |\n| 5     |          310 |        0.1504 |     SAMME |  0.7988 |   0.8294 |     0.696s |  3.843s | COMPLETE |\n| 6     |          380 |         2.445 |     SAMME |  0.5978 |   0.8294 |     0.830s |  4.674s | COMPLETE |\n| 7     |          100 |        0.9151 |     SAMME |  0.8372 |   0.8372 |     0.328s |  5.002s | COMPLETE |\n| 8     |          350 |        8.9334 |     SAMME |  0.6751 |   0.8372 |     0.786s |  5.787s | COMPLETE |\n| 9     |          450 |        0.1974 |     SAMME |    0.82 |   0.8372 |     0.969s |  6.757s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 7\nBest parameters:\n --> n_estimators: 100\n --> learning_rate: 0.9151\n --> algorithm: SAMME\nBest evaluation --> auc: 0.8372\nTime elapsed: 6.757s\nFit ---------------------------------------------\nTrain evaluation --> auc: 0.9133\nTest evaluation --> auc: 0.8353\nTime elapsed: 0.232s\n-------------------------------------------------\nTime: 6.989s\n\n\nFinal results ==================== >>\nTotal time: 9.134s\n-------------------------------------\nLinearDiscriminantAnalysis --> auc: 0.8037\nAdaBoost                   --> auc: 0.8353 !\n
                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                  atom.evaluate()\n
                                                                                                                                                                                  atom.evaluate() Out[5]: accuracy ap ba f1 jaccard mcc precision recall auc LDA 0.785 0.5888 0.7533 0.5825 0.4110 0.4542 0.5000 0.6977 0.8037 AdaB 0.820 0.5801 0.7165 0.5610 0.3898 0.4490 0.5897 0.5349 0.8353"}, {"location": "examples/getting_started/#example-getting-started", "title": "Example: Getting started\u00b6", "text": "

                                                                                                                                                                                  This example shows how to get started with the atom-ml library.

                                                                                                                                                                                  The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target RainTomorrow.

                                                                                                                                                                                  "}, {"location": "examples/holdout_set/", "title": "Holdout set", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                  # Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                  # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                  # Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n
                                                                                                                                                                                  # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\") # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0

                                                                                                                                                                                  5 rows \u00d7 22 columns

                                                                                                                                                                                  In\u00a0[3]: Copied!
                                                                                                                                                                                  # Initialize atom specifying a fraction of the dataset for holdout\natom = ATOMClassifier(X, n_rows=0.5, holdout_size=0.2, verbose=2)\n
                                                                                                                                                                                  # Initialize atom specifying a fraction of the dataset for holdout atom = ATOMClassifier(X, n_rows=0.5, holdout_size=0.2, verbose=2)
                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (56877, 22)\nTrain set size: 42658\nTest set size: 14219\nHoldout set size: 14219\n-------------------------------------\nMemory: 10.01 MB\nScaled: False\nMissing values: 126822 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 15 (0.0%)\n\n
                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                  # The test and holdout fractions are split after subsampling the dataset\n# Also note that the holdout data set is not a part of atom's dataset\nprint(\"Length loaded data:\", len(X))\nprint(\"Length dataset + holdout:\", len(atom.dataset) + len(atom.holdout))\n
                                                                                                                                                                                  # The test and holdout fractions are split after subsampling the dataset # Also note that the holdout data set is not a part of atom's dataset print(\"Length loaded data:\", len(X)) print(\"Length dataset + holdout:\", len(atom.dataset) + len(atom.holdout))
                                                                                                                                                                                  Length loaded data: 142193\nLength dataset + holdout: 71096\n
                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                  atom.impute()\natom.encode()\n
                                                                                                                                                                                  atom.impute() atom.encode()
                                                                                                                                                                                  Fitting Imputer...\nImputing missing values...\n --> Dropping 258 samples due to missing values in feature MinTemp.\n --> Dropping 127 samples due to missing values in feature MaxTemp.\n --> Dropping 553 samples due to missing values in feature Rainfall.\n --> Dropping 24308 samples due to missing values in feature Evaporation.\n --> Dropping 27187 samples due to missing values in feature Sunshine.\n --> Dropping 3739 samples due to missing values in feature WindGustDir.\n --> Dropping 3712 samples due to missing values in feature WindGustSpeed.\n --> Dropping 3995 samples due to missing values in feature WindDir9am.\n --> Dropping 1508 samples due to missing values in feature WindDir3pm.\n --> Dropping 539 samples due to missing values in feature WindSpeed9am.\n --> Dropping 1077 samples due to missing values in feature WindSpeed3pm.\n --> Dropping 706 samples due to missing values in feature Humidity9am.\n --> Dropping 1447 samples due to missing values in feature Humidity3pm.\n --> Dropping 5610 samples due to missing values in feature Pressure9am.\n --> Dropping 5591 samples due to missing values in feature Pressure3pm.\n --> Dropping 21520 samples due to missing values in feature Cloud9am.\n --> Dropping 22921 samples due to missing values in feature Cloud3pm.\n --> Dropping 365 samples due to missing values in feature Temp9am.\n --> Dropping 1106 samples due to missing values in feature Temp3pm.\n --> Dropping 553 samples due to missing values in feature RainToday.\nFitting Encoder...\nEncoding categorical columns...\n --> Target-encoding feature Location. Contains 26 classes.\n --> Target-encoding feature WindGustDir. Contains 16 classes.\n --> Target-encoding feature WindDir9am. Contains 16 classes.\n --> Target-encoding feature WindDir3pm. 
Contains 16 classes.\n --> Ordinal-encoding feature RainToday. Contains 2 classes.\n
                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                  # Unlike train and test, the holdout data set is not transformed until used for predictions\natom.holdout\n
                                                                                                                                                                                  # Unlike train and test, the holdout data set is not transformed until used for predictions atom.holdout Out[6]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 22540 NorahHead 15.8 23.7 0.4 NaN NaN SSW 50.0 NW NaN ... 79.0 80.0 1012.4 1009.6 NaN NaN 18.4 18.9 No 0 22541 Brisbane 13.0 24.1 0.0 3.2 3.6 W 24.0 SW WSW ... 53.0 27.0 1019.9 1015.9 7.0 8.0 17.3 22.1 No 0 22542 MountGambier 14.7 36.2 0.0 7.2 12.5 S 33.0 N SSW ... 52.0 27.0 1018.8 1017.4 7.0 2.0 25.2 35.4 No 0 22543 Launceston 12.3 21.4 0.0 NaN NaN NNW 52.0 NNW NNW ... 62.0 60.0 NaN NaN 5.0 8.0 16.2 20.4 No 0 22544 MountGinini 3.2 10.0 0.0 NaN NaN WSW 52.0 WSW WSW ... 97.0 95.0 NaN NaN NaN NaN 6.5 8.4 No 0 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 36754 MountGinini 1.6 4.4 0.0 NaN NaN E 52.0 E E ... 100.0 100.0 NaN NaN NaN NaN 2.7 2.6 No 1 36755 WaggaWagga 9.9 21.8 0.0 4.6 5.7 WSW 35.0 S SW ... 57.0 36.0 1015.5 1013.7 7.0 7.0 17.0 21.3 No 0 36756 Walpole 8.8 16.3 0.8 NaN NaN NNW 37.0 NNE N ... 84.0 79.0 1018.4 1013.5 NaN NaN 11.0 14.6 No 1 36757 Dartmoor 8.7 15.5 2.0 1.4 5.4 S 30.0 WSW SSW ... 100.0 94.0 1018.6 1020.0 NaN NaN 12.9 12.8 Yes 0 36758 SydneyAirport 16.8 22.6 8.4 5.0 3.8 S 57.0 WNW S ... 79.0 75.0 1013.2 1013.7 8.0 6.0 17.1 18.8 Yes 0

                                                                                                                                                                                  14219 rows \u00d7 22 columns

                                                                                                                                                                                  In\u00a0[7]: Copied!
                                                                                                                                                                                  atom.run(models=[\"GNB\", \"LR\", \"RF\"])\n
                                                                                                                                                                                  atom.run(models=[\"GNB\", \"LR\", \"RF\"])
                                                                                                                                                                                  \nTraining ========================= >>\nModels: GNB, LR, RF\nMetric: f1\n\n\nResults for GaussianNB:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.604\nTest evaluation --> f1: 0.6063\nTime elapsed: 0.209s\n-------------------------------------------------\nTime: 0.209s\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.6188\nTest evaluation --> f1: 0.6162\nTime elapsed: 0.323s\n-------------------------------------------------\nTime: 0.323s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.6084\nTime elapsed: 4.533s\n-------------------------------------------------\nTime: 4.533s\n\n\nFinal results ==================== >>\nTotal time: 5.734s\n-------------------------------------\nGaussianNB         --> f1: 0.6063\nLogisticRegression --> f1: 0.6162 !\nRandomForest       --> f1: 0.6084 ~\n
                                                                                                                                                                                  In\u00a0[8]: Copied!
                                                                                                                                                                                  atom.plot_prc()\n
                                                                                                                                                                                  atom.plot_prc() In\u00a0[9]: Copied!
                                                                                                                                                                                  # Based on the results on the test set, we select the best model for further tuning\natom.run(\"lr_tuned\", n_trials=10)\n
                                                                                                                                                                                  # Based on the results on the test set, we select the best model for further tuning atom.run(\"lr_tuned\", n_trials=10)
                                                                                                                                                                                  \nTraining ========================= >>\nModels: LR_tuned\nMetric: f1\n\n\nRunning hyperparameter tuning for LogisticRegression...\n| trial | penalty |       C |  solver | max_iter | l1_ratio |      f1 | best_f1 | time_trial | time_ht |    state |\n| ----- | ------- | ------- | ------- | -------- | -------- | ------- | ------- | ---------- | ------- | -------- |\n| 0     |    None |  0.1893 |     sag |      540 |      0.4 |  0.6096 |  0.6096 |     0.797s |  0.797s | COMPLETE |\n| 1     |      l2 |  0.6275 | newto.. |      150 |      0.7 |  0.6101 |  0.6101 |     0.637s |  1.433s | COMPLETE |\n| 2     |      l1 |  0.7457 | libli.. |      740 |      0.7 |  0.6114 |  0.6114 |     0.815s |  2.248s | COMPLETE |\n| 3     |      l2 |  0.0759 | newto.. |      290 |      0.4 |  0.6204 |  0.6204 |     0.634s |  2.882s | COMPLETE |\n| 4     |      l2 |  0.2122 | newto.. 
|      730 |      0.9 |  0.6273 |  0.6273 |     0.635s |  3.516s | COMPLETE |\n| 5     |      l2 |  0.0017 |   lbfgs |      260 |      1.0 |   0.589 |  0.6273 |     0.581s |  4.097s | COMPLETE |\n| 6     |      l2 |  0.0137 |     sag |      130 |      0.4 |  0.6092 |  0.6273 |     0.615s |  4.711s | COMPLETE |\n| 7     |    None |  0.0014 |     sag |      640 |      0.1 |  0.5909 |  0.6273 |     0.725s |  5.436s | COMPLETE |\n| 8     |      l2 |  0.0224 |     sag |      500 |      1.0 |  0.6226 |  0.6273 |     0.653s |  6.089s | COMPLETE |\n| 9     |      l1 |  0.1594 |    saga |      630 |      0.2 |  0.6236 |  0.6273 |     0.810s |  6.898s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 4\nBest parameters:\n --> penalty: l2\n --> C: 0.2122\n --> solver: newton-cg\n --> max_iter: 730\n --> l1_ratio: 0.9\nBest evaluation --> f1: 0.6273\nTime elapsed: 6.898s\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.6188\nTest evaluation --> f1: 0.6172\nTime elapsed: 0.352s\n-------------------------------------------------\nTime: 7.251s\n\n\nFinal results ==================== >>\nTotal time: 7.461s\n-------------------------------------\nLogisticRegression --> f1: 0.6172\n

                                                                                                                                                                                  We already used the test set to choose the best model for further tuning, so this set is no longer truly independent. Although it may not be directly visible in the results, using the test set now to evaluate the tuned LR model would be a mistake, since it carries a bias. For this reason, we have set apart an extra, independent set to validate the final model: the holdout set. If we are not going to use the test set for validation, we might as well use it to train the model and so optimize the use of the available data. Use the full_train method for this.

                                                                                                                                                                                  In\u00a0[10]: Copied!
                                                                                                                                                                                  # Re-train the model on the full dataset (train + test) \natom.lr_tuned.full_train()\n
                                                                                                                                                                                  # Re-train the model on the full dataset (train + test) atom.lr_tuned.full_train()
                                                                                                                                                                                  Fit ---------------------------------------------\nTrain evaluation --> f1: 0.6185\nTest evaluation --> f1: 0.6185\nTime elapsed: 0.717s\n
                                                                                                                                                                                  In\u00a0[11]: Copied!
                                                                                                                                                                                  # Evaluate on the holdout set\natom.lr_tuned.evaluate(rows=\"holdout\")\n
                                                                                                                                                                                  # Evaluate on the holdout set atom.lr_tuned.evaluate(rows=\"holdout\") Out[11]:
                                                                                                                                                                                  accuracy     0.8577\nap           0.7473\nba           0.7480\nf1           0.6352\njaccard      0.4654\nmcc          0.5606\nprecision    0.7559\nrecall       0.5477\nauc          0.8873\nName: LR_tuned, dtype: float64
                                                                                                                                                                                  In\u00a0[13]: Copied!
                                                                                                                                                                                  atom.lr_tuned.plot_prc(rows=\"holdout\", legend=\"upper right\")\n
                                                                                                                                                                                  atom.lr_tuned.plot_prc(rows=\"holdout\", legend=\"upper right\")"}, {"location": "examples/holdout_set/#example-holdout-set", "title": "Example: Holdout set\u00b6", "text": "

                                                                                                                                                                                  This example shows when and how to use ATOM's holdout set in an exploration pipeline.

                                                                                                                                                                                  The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target RainTomorrow.

                                                                                                                                                                                  "}, {"location": "examples/holdout_set/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/holdout_set/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/holdout_set/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/hyperparameter_tuning/", "title": "Hyperparameter tuning", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                  # Import packages\nfrom sklearn.datasets import load_breast_cancer\nfrom optuna.distributions import IntDistribution\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                  # Import packages from sklearn.datasets import load_breast_cancer from optuna.distributions import IntDistribution from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                  # Load the data\nX, y = load_breast_cancer(return_X_y=True)\n
                                                                                                                                                                                  # Load the data X, y = load_breast_cancer(return_X_y=True) In\u00a0[3]: Copied!
                                                                                                                                                                                  # Initialize atom\natom = ATOMClassifier(X, y, n_jobs=4, verbose=2, random_state=1)\n
                                                                                                                                                                                  # Initialize atom atom = ATOMClassifier(X, y, n_jobs=4, verbose=2, random_state=1)
                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\nParallel processing with 4 cores.\nParallelization backend: loky\n\nDataset stats ==================== >>\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 141.24 kB\nScaled: False\nOutlier values: 167 (1.2%)\n\n
                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                  # Train a MultiLayerPerceptron model on two metrics\n# using a custom number of hidden layers\natom.run(\n    models=\"MLP\",\n    metric=[\"f1\", \"ap\"],\n    n_trials=10,\n    est_params={\"activation\": \"relu\"},\n    ht_params={\n        \"distributions\": {\n            \"hidden_layer_1\": IntDistribution(2, 4),\n            \"hidden_layer_2\": IntDistribution(10, 20),\n            \"hidden_layer_3\": IntDistribution(10, 20),\n            \"hidden_layer_4\": IntDistribution(2, 4),\n        }\n    }\n)\n
                                                                                                                                                                                  # Train a MultiLayerPerceptron model on two metrics # using a custom number of hidden layers atom.run( models=\"MLP\", metric=[\"f1\", \"ap\"], n_trials=10, est_params={\"activation\": \"relu\"}, ht_params={ \"distributions\": { \"hidden_layer_1\": IntDistribution(2, 4), \"hidden_layer_2\": IntDistribution(10, 20), \"hidden_layer_3\": IntDistribution(10, 20), \"hidden_layer_4\": IntDistribution(2, 4), } } )
                                                                                                                                                                                  \nTraining ========================= >>\nModels: MLP\nMetric: f1, ap\n\n\nRunning hyperparameter tuning for MultiLayerPerceptron...\n| trial | hidden_layer_1 | hidden_layer_2 | hidden_layer_3 | hidden_layer_4 |      f1 | best_f1 |      ap | best_ap | time_trial | time_ht |    state |\n| ----- | -------------- | -------------- | -------------- | -------------- | ------- | ------- | ------- | ------- | ---------- | ------- | -------- |\n| 0     |              3 |             17 |             10 |              2 |  0.9464 |  0.9464 |  0.9844 |  0.9844 |     9.139s |  9.139s | COMPLETE |\n| 1     |              2 |             11 |             12 |              3 |  0.9744 |  0.9744 |  0.9991 |  0.9991 |    11.466s | 20.605s | COMPLETE |\n| 2     |              3 |             15 |             14 |              4 |  0.9915 |  0.9915 |  0.9978 |  0.9991 |     8.570s | 29.175s | COMPLETE |\n| 3     |              2 |             19 |             10 |              4 |  0.9655 |  0.9915 |  0.9878 |  0.9991 |     9.208s | 38.383s | COMPLETE |\n| 4     |              3 |             16 |             11 |              2 |  0.9661 |  0.9915 |  0.9981 |  0.9991 |     0.657s | 39.039s | COMPLETE |\n| 5     |              4 |             20 |             13 |              4 |  0.9739 |  0.9915 |  0.9989 |  0.9991 |     0.623s | 39.662s | COMPLETE |\n| 6     |              4 |             19 |             10 |              2 |  0.9828 |  0.9915 |  0.9907 |  0.9991 |     0.601s | 40.263s | COMPLETE |\n| 7     |              2 |             19 |             11 |              3 |  0.7733 |  0.9915 |  0.9997 |  0.9997 |     0.601s | 40.863s | COMPLETE |\n| 8     |              4 |             15 |             17 |              2 |  0.9915 |  0.9915 |  0.9997 |  0.9997 |     0.601s | 41.464s | COMPLETE |\n| 9   
  |              4 |             19 |             10 |              4 |  0.9828 |  0.9915 |  0.9822 |  0.9997 |     0.599s | 42.062s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 8\nBest parameters:\n --> hidden_layer_sizes: (4, 15, 17, 2)\nBest evaluation --> f1: 0.9915   ap: 0.9997\nTime elapsed: 42.062s\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9965   ap: 0.9991\nTest evaluation --> f1: 0.9718   ap: 0.9938\nTime elapsed: 1.515s\n-------------------------------------------------\nTime: 43.578s\n\n\nFinal results ==================== >>\nTotal time: 43.815s\n-------------------------------------\nMultiLayerPerceptron --> f1: 0.9718   ap: 0.9938\n
                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                  # For multi-metric runs, the selected best trial is the first in the Pareto front\natom.mlp.best_trial\n
                                                                                                                                                                                  # For multi-metric runs, the selected best trial is the first in the Pareto front atom.mlp.best_trial Out[5]:
                                                                                                                                                                                  FrozenTrial(number=8, state=1, values=[0.9914529914529915, 0.9997077732320282], datetime_start=datetime.datetime(2023, 11, 4, 19, 13, 50, 113304), datetime_complete=datetime.datetime(2023, 11, 4, 19, 13, 50, 713850), params={'hidden_layer_1': 4, 'hidden_layer_2': 15, 'hidden_layer_3': 17, 'hidden_layer_4': 2}, user_attrs={'estimator': MLPClassifier(hidden_layer_sizes=(4, 15, 17, 2), random_state=1)}, system_attrs={'nsga2:generation': 0}, intermediate_values={}, distributions={'hidden_layer_1': IntDistribution(high=4, log=False, low=2, step=1), 'hidden_layer_2': IntDistribution(high=20, log=False, low=10, step=1), 'hidden_layer_3': IntDistribution(high=20, log=False, low=10, step=1), 'hidden_layer_4': IntDistribution(high=4, log=False, low=2, step=1)}, trial_id=8, value=None)
                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                  atom.plot_pareto_front()\n
                                                                                                                                                                                  atom.plot_pareto_front() In\u00a0[7]: Copied!
                                                                                                                                                                                  # If you are unhappy with the results, it's possible to continue the study\natom.mlp.hyperparameter_tuning(n_trials=5)\n
                                                                                                                                                                                  # If you are unhappy with the results, it's possible to continue the study atom.mlp.hyperparameter_tuning(n_trials=5)
                                                                                                                                                                                  Running hyperparameter tuning for MultiLayerPerceptron...\n| trial | hidden_layer_1 | hidden_layer_2 | hidden_layer_3 | hidden_layer_4 |      f1 | best_f1 |      ap | best_ap | time_trial | time_ht |    state |\n| ----- | -------------- | -------------- | -------------- | -------------- | ------- | ------- | ------- | ------- | ---------- | ------- | -------- |\n| 10    |              4 |             18 |             13 |              4 |  0.9831 |  0.9915 |  0.9997 |  0.9997 |     0.673s | 42.735s | COMPLETE |\n| 11    |              2 |             14 |             19 |              2 |  0.9421 |  0.9915 |  0.9899 |  0.9997 |     0.604s | 43.339s | COMPLETE |\n| 12    |              2 |             11 |             10 |              4 |  0.7733 |  0.9915 |    0.99 |  0.9997 |     0.617s | 43.955s | COMPLETE |\n| 13    |              2 |             12 |             15 |              2 |  0.9558 |  0.9915 |  0.9985 |  0.9997 |     0.595s | 44.550s | COMPLETE |\n| 14    |              3 |             11 |             16 |              4 |  0.7733 |  0.9915 |  0.9721 |  0.9997 |     0.663s | 45.212s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 8\nBest parameters:\n --> hidden_layer_sizes: (4, 15, 17, 2)\nBest evaluation --> f1: 0.9915   ap: 0.9997\nTime elapsed: 45.212s\n
                                                                                                                                                                                  In\u00a0[8]: Copied!
                                                                                                                                                                                  # The trials attribute gives an overview of the trial results\natom.mlp.trials\n
                                                                                                                                                                                  # The trials attribute gives an overview of the trial results atom.mlp.trials Out[8]: hidden_layer_1 hidden_layer_2 hidden_layer_3 hidden_layer_4 estimator f1 best_f1 ap best_ap time_trial time_ht state trial 0 3 17 10 2 MLPClassifier(hidden_layer_sizes=(3, 17, 10, 2... 0.946429 0.991453 0.984402 0.999708 9.138911 9.138911 COMPLETE 1 2 11 12 3 MLPClassifier(hidden_layer_sizes=(2, 11, 12, 3... 0.974359 0.991453 0.999128 0.999708 11.466475 20.605386 COMPLETE 2 3 15 14 4 MLPClassifier(hidden_layer_sizes=(3, 15, 14, 4... 0.991453 0.991453 0.997842 0.999708 8.569545 29.174931 COMPLETE 3 2 19 10 4 MLPClassifier(hidden_layer_sizes=(2, 19, 10, 4... 0.965517 0.991453 0.987805 0.999708 9.207920 38.382851 COMPLETE 4 3 16 11 2 MLPClassifier(hidden_layer_sizes=(3, 16, 11, 2... 0.966102 0.991453 0.998086 0.999708 0.656597 39.039448 COMPLETE 5 4 20 13 4 MLPClassifier(hidden_layer_sizes=(4, 20, 13, 4... 0.973913 0.991453 0.998855 0.999708 0.622566 39.662014 COMPLETE 6 4 19 10 2 MLPClassifier(hidden_layer_sizes=(4, 19, 10, 2... 0.982759 0.991453 0.990748 0.999708 0.600547 40.262561 COMPLETE 7 2 19 11 3 MLPClassifier(hidden_layer_sizes=(2, 19, 11, 3... 0.773333 0.991453 0.999708 0.999708 0.600546 40.863107 COMPLETE 8 4 15 17 2 MLPClassifier(hidden_layer_sizes=(4, 15, 17, 2... 0.991453 0.991453 0.999708 0.999708 0.600546 41.463653 COMPLETE 9 4 19 10 4 MLPClassifier(hidden_layer_sizes=(4, 19, 10, 4... 0.982759 0.991453 0.982168 0.999708 0.598815 42.062468 COMPLETE 10 4 18 13 4 MLPClassifier(hidden_layer_sizes=(4, 18, 13, 4... 0.983051 0.991453 0.999708 0.999708 0.672611 42.735079 COMPLETE 11 2 14 19 2 MLPClassifier(hidden_layer_sizes=(2, 14, 19, 2... 0.942149 0.991453 0.989914 0.999708 0.603549 43.338628 COMPLETE 12 2 11 10 4 MLPClassifier(hidden_layer_sizes=(2, 11, 10, 4... 
0.773333 0.991453 0.990024 0.999708 0.616561 43.955189 COMPLETE 13 2 12 15 2 MLPClassifier(hidden_layer_sizes=(2, 12, 15, 2... 0.955752 0.991453 0.998518 0.999708 0.594541 44.549730 COMPLETE 14 3 11 16 4 MLPClassifier(hidden_layer_sizes=(3, 11, 16, 4... 0.773333 0.991453 0.972070 0.999708 0.662602 45.212332 COMPLETE In\u00a0[9]: Copied!
                                                                                                                                                                                  # Select a custom best trial...\natom.mlp.best_trial = 2\n\n# ...and check that the best parameters are now those in the selected trial\natom.mlp.best_params\n
                                                                                                                                                                                  # Select a custom best trial... atom.mlp.best_trial = 2 # ...and check that the best parameters are now those in the selected trial atom.mlp.best_params Out[9]:
                                                                                                                                                                                  {'hidden_layer_sizes': (3, 15, 14, 4)}
                                                                                                                                                                                  In\u00a0[10]: Copied!
                                                                                                                                                                                  # Lastly, fit the model on the complete training set \n# using the new combination of hyperparameters\natom.mlp.fit()\n
                                                                                                                                                                                  # Lastly, fit the model on the complete training set # using the new combination of hyperparameters atom.mlp.fit()
                                                                                                                                                                                  Fit ---------------------------------------------\nTrain evaluation --> f1: 0.9983   ap: 0.9998\nTest evaluation --> f1: 0.9718   ap: 0.9947\nTime elapsed: 3.048s\n
                                                                                                                                                                                  In\u00a0[11]: Copied!
                                                                                                                                                                                  atom.plot_trials()\n
                                                                                                                                                                                  atom.plot_trials() In\u00a0[12]: Copied!
                                                                                                                                                                                  atom.plot_parallel_coordinate()\n
                                                                                                                                                                                  atom.plot_parallel_coordinate()"}, {"location": "examples/hyperparameter_tuning/#example-hyperparameter-tuning", "title": "Example: Hyperparameter tuning\u00b6", "text": "

                                                                                                                                                                                  This advanced example shows how to optimize your model's hyperparameters for multi-metric runs.

                                                                                                                                                                                  Import the breast cancer dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.

                                                                                                                                                                                  "}, {"location": "examples/hyperparameter_tuning/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/hyperparameter_tuning/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/hyperparameter_tuning/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/imbalanced_datasets/", "title": "Imbalanced datasets", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                  # Import packages\nfrom atom import ATOMClassifier\nfrom sklearn.datasets import make_classification\n
                                                                                                                                                                                  # Import packages from atom import ATOMClassifier from sklearn.datasets import make_classification In\u00a0[2]: Copied!
                                                                                                                                                                                  # Create a mock imbalanced dataset\nX, y = make_classification(\n    n_samples=5000,\n    n_features=30,\n    n_informative=20,\n    weights=(0.95,),\n    random_state=1,\n)\n
                                                                                                                                                                                  # Create a mock imbalanced dataset X, y = make_classification( n_samples=5000, n_features=30, n_informative=20, weights=(0.95,), random_state=1, ) In\u00a0[3]: Copied!
                                                                                                                                                                                  # Initialize atom\natom = ATOMClassifier(X, y, test_size=0.2, verbose=2, random_state=1)\n
                                                                                                                                                                                  # Initialize atom atom = ATOMClassifier(X, y, test_size=0.2, verbose=2, random_state=1)
                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (5000, 31)\nTrain set size: 4000\nTest set size: 1000\n-------------------------------------\nMemory: 1.24 MB\nScaled: False\nOutlier values: 570 (0.5%)\n\n
                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                  # Let's have a look at the data. Note that, since the input wasn't\n# a dataframe, atom has given default names to the columns.\natom.head()\n
                                                                                                                                                                                  # Let's have a look at the data. Note that, since the input wasn't # a dataframe, atom has given default names to the columns. atom.head() Out[4]: x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 ... x21 x22 x23 x24 x25 x26 x27 x28 x29 target 0 -0.535760 -2.426045 1.256836 0.374501 -3.241958 -1.239468 -0.208750 -6.015995 3.698669 0.112512 ... 0.044302 -1.935727 10.870353 0.286755 -2.416507 0.556990 -1.522635 3.719201 1.449135 0 1 -3.311935 -3.149920 -0.801252 -2.644414 -0.704889 -3.312256 0.714515 2.992345 5.056910 3.036775 ... 2.224359 0.451273 -1.822108 -1.435801 0.036132 -1.364583 1.215663 5.232161 1.408798 0 2 3.821199 1.328129 -1.000720 -13.151697 0.254253 1.263636 -1.088451 4.924264 -1.225646 -6.974824 ... 3.541222 1.686667 -13.763703 -1.321256 1.677687 0.774966 -5.067689 4.663386 -1.714186 0 3 5.931126 3.338830 0.545906 2.296355 -3.941088 3.527252 -0.158770 3.138381 -0.927460 -1.642079 ... -3.634442 7.853176 -8.457598 0.000490 -2.612756 -1.138206 0.497150 4.351289 -0.321748 0 4 -2.829472 -1.227185 -0.751892 3.056106 -1.988920 -2.219184 -0.075882 5.790102 -2.786671 2.023458 ... 4.057954 1.178564 -15.028187 1.627140 -1.093587 -0.422655 1.777011 6.660638 -2.553723 0

                                                                                                                                                                                  5 rows \u00d7 31 columns

                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                  # Let's start reducing the number of features\natom.feature_selection(\"rfe\", solver=\"rf\", n_features=12)\n
                                                                                                                                                                                  # Let's start reducing the number of features atom.feature_selection(\"rfe\", solver=\"rf\", n_features=12)
                                                                                                                                                                                  Fitting FeatureSelector...\nPerforming feature selection ...\n --> rfe selected 12 features from the dataset.\n   --> Dropping feature x1 (rank 8).\n   --> Dropping feature x2 (rank 11).\n   --> Dropping feature x4 (rank 3).\n   --> Dropping feature x6 (rank 16).\n   --> Dropping feature x7 (rank 14).\n   --> Dropping feature x10 (rank 19).\n   --> Dropping feature x12 (rank 13).\n   --> Dropping feature x13 (rank 12).\n   --> Dropping feature x14 (rank 9).\n   --> Dropping feature x16 (rank 10).\n   --> Dropping feature x18 (rank 17).\n   --> Dropping feature x19 (rank 2).\n   --> Dropping feature x20 (rank 4).\n   --> Dropping feature x22 (rank 7).\n   --> Dropping feature x23 (rank 5).\n   --> Dropping feature x24 (rank 18).\n   --> Dropping feature x25 (rank 6).\n   --> Dropping feature x26 (rank 15).\n
                                                                                                                                                                                  In\u00a0[7]: Copied!
                                                                                                                                                                                  # Fit a model directly on the imbalanced data\natom.run(\"RF\", metric=\"ba\")\n
                                                                                                                                                                                  # Fit a model directly on the imbalanced data atom.run(\"RF\", metric=\"ba\")
                                                                                                                                                                                  \nTraining ========================= >>\nModels: RF\nMetric: ba\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> ba: 1.0\nTest evaluation --> ba: 0.5556\nTime elapsed: 2.497s\n-------------------------------------------------\nTime: 2.497s\n\n\nFinal results ==================== >>\nTotal time: 2.568s\n-------------------------------------\nRandomForest --> ba: 0.5556 ~\n
                                                                                                                                                                                  In\u00a0[8]: Copied!
                                                                                                                                                                                  # The transformer and the models have been added to the branch\natom.branch\n
                                                                                                                                                                                  # The transformer and the models have been added to the branch atom.branch Out[8]:
                                                                                                                                                                                  Branch(main)
                                                                                                                                                                                  In\u00a0[9]: Copied!
                                                                                                                                                                                  # Create a new branch for oversampling\natom.branch = \"oversample\"\n
                                                                                                                                                                                  # Create a new branch for oversampling atom.branch = \"oversample\"
                                                                                                                                                                                  Successfully created new branch: oversample.\n
                                                                                                                                                                                  In\u00a0[10]: Copied!
                                                                                                                                                                                  # Perform oversampling of the minority class\natom.balance(strategy=\"smote\")\n
                                                                                                                                                                                  # Perform oversampling of the minority class atom.balance(strategy=\"smote\")
                                                                                                                                                                                  Oversampling with SMOTE...\n --> Adding 3570 samples to class 1.\n
                                                                                                                                                                                  In\u00a0[11]: Copied!
                                                                                                                                                                                  atom.classes  # Check the balanced training set!\n
                                                                                                                                                                                  atom.classes # Check the balanced training set! Out[11]: dataset train test 0 4731 3785 946 1 3839 3785 54 In\u00a0[12]: Copied!
                                                                                                                                                                                  # Train another model on the new branch. Add a tag after \n# the model's acronym to distinguish it from the first model\natom.run(\"rf_os\")  # os for oversample\n
                                                                                                                                                                                  # Train another model on the new branch. Add a tag after # the model's acronym to distinguish it from the first model atom.run(\"rf_os\") # os for oversample
                                                                                                                                                                                  \nTraining ========================= >>\nModels: RF_os\nMetric: ba\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> ba: 1.0\nTest evaluation --> ba: 0.7672\nTime elapsed: 4.136s\n-------------------------------------------------\nTime: 4.136s\n\n\nFinal results ==================== >>\nTotal time: 4.248s\n-------------------------------------\nRandomForest --> ba: 0.7672 ~\n
                                                                                                                                                                                  In\u00a0[14]: Copied!
                                                                                                                                                                                  # Create the undersampling branch\n# Split from master to not adopt the oversampling transformer\natom.branch = \"undersample_from_main\"\n
                                                                                                                                                                                  # Create the undersampling branch # Split from master to not adopt the oversampling transformer atom.branch = \"undersample_from_main\"
                                                                                                                                                                                  Successfully created new branch: undersample.\n
                                                                                                                                                                                  In\u00a0[15]: Copied!
                                                                                                                                                                                  atom.classes  # In this branch, the data is still imbalanced\n
                                                                                                                                                                                  atom.classes # In this branch, the data is still imbalanced Out[15]: dataset train test 0 4731 3785 946 1 269 215 54 In\u00a0[16]: Copied!
                                                                                                                                                                                  # Perform undersampling of the majority class\natom.balance(strategy=\"NearMiss\")\n
                                                                                                                                                                                  # Perform undersampling of the majority class atom.balance(strategy=\"NearMiss\")
                                                                                                                                                                                  Undersampling with NearMiss...\n --> Removing 3570 samples from class 0.\n
                                                                                                                                                                                  In\u00a0[17]: Copied!
                                                                                                                                                                                  atom.run(\"rf_us\")\n
                                                                                                                                                                                  atom.run(\"rf_us\")
                                                                                                                                                                                  \nTraining ========================= >>\nModels: RF_us\nMetric: ba\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> ba: 1.0\nTest evaluation --> ba: 0.6706\nTime elapsed: 0.285s\n-------------------------------------------------\nTime: 0.285s\n\n\nFinal results ==================== >>\nTotal time: 0.321s\n-------------------------------------\nRandomForest --> ba: 0.6706 ~\n
                                                                                                                                                                                  In\u00a0[18]: Copied!
                                                                                                                                                                                  # Check that the branch only contains the desired transformers \natom.branch\n
                                                                                                                                                                                  # Check that the branch only contains the desired transformers atom.branch Out[18]:
                                                                                                                                                                                  Branch(undersample)
                                                                                                                                                                                  In\u00a0[19]: Copied!
                                                                                                                                                                                  # Visualize the complete pipeline\natom.plot_pipeline()\n
                                                                                                                                                                                  # Visualize the complete pipeline atom.plot_pipeline() In\u00a0[20]: Copied!
                                                                                                                                                                                  atom.evaluate()\n
                                                                                                                                                                                  atom.evaluate() Out[20]: accuracy ap ba f1 jaccard mcc precision recall auc RF 0.952 0.6562 0.5556 0.2000 0.1111 0.3252 1.000 0.1111 0.9107 RF_os 0.956 0.6215 0.7672 0.5769 0.4054 0.5542 0.600 0.5556 0.9251 RF_us 0.509 0.3687 0.6706 0.1578 0.0857 0.1545 0.087 0.8519 0.8258 In\u00a0[21]: Copied!
                                                                                                                                                                                  atom.plot_prc()\n
                                                                                                                                                                                  atom.plot_prc() In\u00a0[22]: Copied!
                                                                                                                                                                                  atom.plot_roc()\n
                                                                                                                                                                                  atom.plot_roc()"}, {"location": "examples/imbalanced_datasets/#example-imbalanced-datasets", "title": "Example: Imbalanced datasets\u00b6", "text": "

                                                                                                                                                                                  This example shows how ATOM can help you handle imbalanced datasets. We will evaluate the performance of three different Random Forest models: one trained directly on the imbalanced dataset, one trained on an oversampled dataset and the last one trained on an undersampled dataset.

                                                                                                                                                                                  "}, {"location": "examples/imbalanced_datasets/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/imbalanced_datasets/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/imbalanced_datasets/#oversampling", "title": "Oversampling\u00b6", "text": ""}, {"location": "examples/imbalanced_datasets/#undersampling", "title": "Undersampling\u00b6", "text": ""}, {"location": "examples/imbalanced_datasets/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/in_training_validation/", "title": "In-training validation", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                  # Import packages\nfrom sklearn.datasets import load_breast_cancer\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                  # Import packages from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                  # Load the data\nX, y = load_breast_cancer(return_X_y=True)\n
                                                                                                                                                                                  # Load the data X, y = load_breast_cancer(return_X_y=True) In\u00a0[3]: Copied!
                                                                                                                                                                                  # Initialize atom\natom = ATOMClassifier(X, y, verbose=2, random_state=1)\n
                                                                                                                                                                                  # Initialize atom atom = ATOMClassifier(X, y, verbose=2, random_state=1)
                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 141.24 kB\nScaled: False\nOutlier values: 167 (1.2%)\n\n
                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                  # Not all models support in-training validation\n# You can check which ones do using the available_models method\ndf = atom.available_models()[[\"acronym\", \"model\", \"has_validation\"]]\ndf[df[\"has_validation\"]]\n
                                                                                                                                                                                  # Not all models support in-training validation # You can check which ones do using the available_models method df = atom.available_models()[[\"acronym\", \"model\", \"has_validation\"]] df[df[\"has_validation\"]] Out[4]: acronym model has_validation 3 CatB CatBoost True 15 LGB LightGBM True 19 MLP MultiLayerPerceptron True 21 PA PassiveAggressive True 22 Perc Perceptron True 27 SGD StochasticGradientDescent True 29 XGB XGBoost True In\u00a0[5]: Copied!
                                                                                                                                                                                  # Run the models normally\natom.run(models=[\"MLP\", \"LGB\"], metric=\"auc\")\n
                                                                                                                                                                                  # Run the models normally atom.run(models=[\"MLP\", \"LGB\"], metric=\"auc\")
                                                                                                                                                                                  \nTraining ========================= >>\nModels: MLP, LGB\nMetric: auc\n\n\nResults for MultiLayerPerceptron:\nFit ---------------------------------------------\nTrain evaluation --> auc: 0.9997\nTest evaluation --> auc: 0.9936\nTime elapsed: 1.821s\n-------------------------------------------------\nTime: 1.821s\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --> auc: 1.0\nTest evaluation --> auc: 0.9775\nTime elapsed: 0.352s\n-------------------------------------------------\nTime: 0.352s\n\n\nFinal results ==================== >>\nTotal time: 2.236s\n-------------------------------------\nMultiLayerPerceptron --> auc: 0.9936 !\nLightGBM             --> auc: 0.9775\n
                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                  atom.plot_evals(title=\"In-training validation scores\")\n
                                                                                                                                                                                  atom.plot_evals(title=\"In-training validation scores\") In\u00a0[7]: Copied!
                                                                                                                                                                                  # Plot the validation on the train and test set\natom.lgb.plot_evals(dataset=\"train+test\", title=\"LightGBM's in-training validation\")\n
                                                                                                                                                                                  # Plot the validation on the train and test set atom.lgb.plot_evals(dataset=\"train+test\", title=\"LightGBM's in-training validation\")"}, {"location": "examples/in_training_validation/#example-in-training-validation", "title": "Example: In-training validation\u00b6", "text": "

                                                                                                                                                                                  This example shows how to keep track of the model's performance during training.

                                                                                                                                                                                  Import the breast cancer dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.

                                                                                                                                                                                  "}, {"location": "examples/in_training_validation/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/in_training_validation/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/in_training_validation/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/memory_considerations/", "title": "Memory considerations", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                  # Import packages\nimport os\nimport tempfile\nimport pandas as pd\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                  # Import packages import os import tempfile import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                  # Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n
                                                                                                                                                                                  # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\") # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0

                                                                                                                                                                                  5 rows \u00d7 22 columns

                                                                                                                                                                                  In\u00a0[3]: Copied!
                                                                                                                                                                                  # Define a temp directory to store the files in this example\ntempdir = tempfile.gettempdir()\n
                                                                                                                                                                                  # Define a temp directory to store the files in this example tempdir = tempfile.gettempdir() In\u00a0[4]: Copied!
                                                                                                                                                                                  def get_size(filepath):\n    \"\"\"Return the size of the object in MB.\"\"\"\n    return f\"{os.path.getsize(filepath + '.pkl') / 1e6:.2f}MB\"\n
                                                                                                                                                                                  def get_size(filepath): \"\"\"Return the size of the object in MB.\"\"\" return f\"{os.path.getsize(filepath + '.pkl') / 1e6:.2f}MB\" In\u00a0[5]: Copied!
                                                                                                                                                                                  atom = ATOMClassifier(X, y=\"RainTomorrow\", verbose=2)\n
                                                                                                                                                                                  atom = ATOMClassifier(X, y=\"RainTomorrow\", verbose=2)
                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 25.03 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n\n

                                                                                                                                                                                  Note that the datset takes ~25MB. We can reduce the size of the dataset using the shrink method, which reduces the dtypes to their smallest possible value.

                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                  atom.dtypes\n
                                                                                                                                                                                  atom.dtypes Out[6]:
                                                                                                                                                                                  Location          object\nMinTemp          float64\nMaxTemp          float64\nRainfall         float64\nEvaporation      float64\nSunshine         float64\nWindGustDir       object\nWindGustSpeed    float64\nWindDir9am        object\nWindDir3pm        object\nWindSpeed9am     float64\nWindSpeed3pm     float64\nHumidity9am      float64\nHumidity3pm      float64\nPressure9am      float64\nPressure3pm      float64\nCloud9am         float64\nCloud3pm         float64\nTemp9am          float64\nTemp3pm          float64\nRainToday         object\nRainTomorrow       int64\ndtype: object
                                                                                                                                                                                  In\u00a0[7]: Copied!
                                                                                                                                                                                  atom.shrink(str2cat=True)\n
                                                                                                                                                                                  atom.shrink(str2cat=True)
                                                                                                                                                                                  The column dtypes are successfully converted.\n
                                                                                                                                                                                  In\u00a0[8]: Copied!
                                                                                                                                                                                  atom.dtypes\n
                                                                                                                                                                                  atom.dtypes Out[8]:
                                                                                                                                                                                  Location         category\nMinTemp           Float32\nMaxTemp           Float32\nRainfall          Float32\nEvaporation       Float32\nSunshine          Float32\nWindGustDir      category\nWindGustSpeed       Int16\nWindDir9am       category\nWindDir3pm       category\nWindSpeed9am        Int16\nWindSpeed3pm         Int8\nHumidity9am          Int8\nHumidity3pm          Int8\nPressure9am       Float32\nPressure3pm       Float32\nCloud9am             Int8\nCloud3pm             Int8\nTemp9am           Float32\nTemp3pm           Float32\nRainToday        category\nRainTomorrow         Int8\ndtype: object
                                                                                                                                                                                  In\u00a0[9]: Copied!
                                                                                                                                                                                  # Let's check the memory usage again...\n# Notice the huge drop!\natom.stats()\n
                                                                                                                                                                                  # Let's check the memory usage again... # Notice the huge drop! atom.stats()
                                                                                                                                                                                  Dataset stats ==================== >>\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 9.67 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n
                                                                                                                                                                                  In\u00a0[10]: Copied!
                                                                                                                                                                                  # Now, we create some new branches to train models with different transformers\natom.impute()\natom.encode()\natom.run(\"LDA\")\n\natom.branch = \"b2\"\natom.scale()\natom.run(\"LDA_scaled\")\n\natom.branch = \"b3_from_main\"\natom.normalize()\natom.run(\"LDA_norm\")\n
                                                                                                                                                                                  # Now, we create some new branches to train models with different trasnformers atom.impute() atom.encode() atom.run(\"LDA\") atom.branch = \"b2\" atom.scale() atom.run(\"LDA_scaled\") atom.branch = \"b3_from_main\" atom.normalize() atom.run(\"LDA_norm\")
                                                                                                                                                                                  Fitting Imputer...\nImputing missing values...\n --> Dropping 637 samples due to missing values in feature MinTemp.\n --> Dropping 322 samples due to missing values in feature MaxTemp.\n --> Dropping 1406 samples due to missing values in feature Rainfall.\n --> Dropping 60843 samples due to missing values in feature Evaporation.\n --> Dropping 67816 samples due to missing values in feature Sunshine.\n --> Dropping 9330 samples due to missing values in feature WindGustDir.\n --> Dropping 9270 samples due to missing values in feature WindGustSpeed.\n --> Dropping 10013 samples due to missing values in feature WindDir9am.\n --> Dropping 3778 samples due to missing values in feature WindDir3pm.\n --> Dropping 1348 samples due to missing values in feature WindSpeed9am.\n --> Dropping 2630 samples due to missing values in feature WindSpeed3pm.\n --> Dropping 1774 samples due to missing values in feature Humidity9am.\n --> Dropping 3610 samples due to missing values in feature Humidity3pm.\n --> Dropping 14014 samples due to missing values in feature Pressure9am.\n --> Dropping 13981 samples due to missing values in feature Pressure3pm.\n --> Dropping 53657 samples due to missing values in feature Cloud9am.\n --> Dropping 57094 samples due to missing values in feature Cloud3pm.\n --> Dropping 904 samples due to missing values in feature Temp9am.\n --> Dropping 2726 samples due to missing values in feature Temp3pm.\n --> Dropping 1406 samples due to missing values in feature RainToday.\nFitting Encoder...\nEncoding categorical columns...\n --> Target-encoding feature Location. Contains 26 classes.\n --> Target-encoding feature WindGustDir. Contains 16 classes.\n --> Target-encoding feature WindDir9am. Contains 16 classes.\n --> Target-encoding feature WindDir3pm. 
Contains 16 classes.\n --> Ordinal-encoding feature RainToday. Contains 2 classes.\n\nTraining ========================= >>\nModels: LDA\nMetric: f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.6213\nTest evaluation --> f1: 0.6341\nTime elapsed: 0.375s\n-------------------------------------------------\nTime: 0.375s\n\n\nFinal results ==================== >>\nTotal time: 0.613s\n-------------------------------------\nLinearDiscriminantAnalysis --> f1: 0.6341\nSuccessfully created new branch: b2.\nFitting Scaler...\nScaling features...\n\nTraining ========================= >>\nModels: LDA_scaled\nMetric: f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.6213\nTest evaluation --> f1: 0.6341\nTime elapsed: 0.390s\n-------------------------------------------------\nTime: 0.390s\n\n\nFinal results ==================== >>\nTotal time: 0.626s\n-------------------------------------\nLinearDiscriminantAnalysis --> f1: 0.6341\nSuccessfully created new branch: b3.\nFitting Normalizer...\nNormalizing features...\n\nTraining ========================= >>\nModels: LDA_norm\nMetric: f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.6267\nTest evaluation --> f1: 0.6368\nTime elapsed: 0.369s\n-------------------------------------------------\nTime: 0.369s\n\n\nFinal results ==================== >>\nTotal time: 0.626s\n-------------------------------------\nLinearDiscriminantAnalysis --> f1: 0.6368\n
                                                                                                                                                                                  In\u00a0[11]: Copied!
                                                                                                                                                                                  # If we save atom now, notice the size\n# This is because atom keeps a copy of every branch in memory\nfilename = tempdir + \"atom1\"\natom.save(filename)\nget_size(filename)\n
                                                                                                                                                                                  # If we save atom now, notice the size # This is because atom keeps a copy of every branch in memory filename = tempdir + \"atom1\" atom.save(filename) get_size(filename)
                                                                                                                                                                                  ATOMClassifier successfully saved.\n
                                                                                                                                                                                  Out[11]:
                                                                                                                                                                                  '34.92MB'

                                                                                                                                                                                  To avoid large memory usage, set the memory parameter.

                                                                                                                                                                                  In\u00a0[12]: Copied!
                                                                                                                                                                                  atom = ATOMClassifier(X, y=\"RainTomorrow\", memory=tempdir, verbose=1, random_state=1)\natom.shrink(str2cat=True)\natom.impute()\natom.encode()\natom.run(\"LDA\")\n\natom.branch = \"b2\"\natom.scale()\natom.run(\"LDA_scaled\")\n\natom.branch = \"b3_from_main\"\natom.normalize()\natom.run(\"LDA_norm\")\n
                                                                                                                                                                                  atom = ATOMClassifier(X, y=\"RainTomorrow\", memory=tempdir, verbose=1, random_state=1) atom.shrink(str2cat=True) atom.impute() atom.encode() atom.run(\"LDA\") atom.branch = \"b2\" atom.scale() atom.run(\"LDA_scaled\") atom.branch = \"b3_from_main\" atom.normalize() atom.run(\"LDA_norm\")
                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\nCache storage: C:\\Users\\Mavs\\AppData\\Local\\Temp\\joblib\n\nDataset stats ==================== >>\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 25.03 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n\nThe column dtypes are successfully converted.\nFitting Imputer...\nImputing missing values...\nFitting Encoder...\nEncoding categorical columns...\n\nTraining ========================= >>\nModels: LDA\nMetric: f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.6233\nTest evaluation --> f1: 0.6248\nTime elapsed: 0.445s\n-------------------------------------------------\nTime: 0.445s\n\n\nFinal results ==================== >>\nTotal time: 0.708s\n-------------------------------------\nLinearDiscriminantAnalysis --> f1: 0.6248\nSuccessfully created new branch: b2.\nFitting Scaler...\nScaling features...\n\nTraining ========================= >>\nModels: LDA_scaled\nMetric: f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.6233\nTest evaluation --> f1: 0.6248\nTime elapsed: 0.454s\n-------------------------------------------------\nTime: 0.454s\n\n\nFinal results ==================== >>\nTotal time: 0.737s\n-------------------------------------\nLinearDiscriminantAnalysis --> f1: 0.6248\nSuccessfully created new branch: b3.\nFitting Normalizer...\nNormalizing features...\n\nTraining ========================= >>\nModels: LDA_norm\nMetric: f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit 
---------------------------------------------\nTrain evaluation --> f1: 0.6279\nTest evaluation --> f1: 0.6298\nTime elapsed: 0.447s\n-------------------------------------------------\nTime: 0.447s\n\n\nFinal results ==================== >>\nTotal time: 0.740s\n-------------------------------------\nLinearDiscriminantAnalysis --> f1: 0.6298\n
                                                                                                                                                                                  In\u00a0[13]: Copied!
                                                                                                                                                                                  # And now, it only takes a fraction of the previous size\n# This is because the data of inactive branches is now stored locally\nfilename = tempdir + \"atom2\"\natom.save(filename)\nget_size(filename)\n
                                                                                                                                                                                  # And now, it only takes a fraction of the previous size # This is because the data of inactive branches is now stored locally filename = tempdir + \"atom2\" atom.save(filename) get_size(filename)
                                                                                                                                                                                  ATOMClassifier successfully saved.\n
                                                                                                                                                                                  Out[13]:
                                                                                                                                                                                  '9.63MB'

                                                                                                                                                                                  Additionnaly, repeated calls to the same transformers with the same data will use the cached results. Don't forget to specify the random_state parameter to ensure the data remains the exact same.

                                                                                                                                                                                  In\u00a0[14]: Copied!
                                                                                                                                                                                  atom = ATOMClassifier(X, y=\"RainTomorrow\", memory=tempdir, verbose=1, random_state=1)\natom.shrink(str2cat=True)\n
                                                                                                                                                                                  atom = ATOMClassifier(X, y=\"RainTomorrow\", memory=tempdir, verbose=1, random_state=1) atom.shrink(str2cat=True)
                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\nCache storage: C:\\Users\\Mavs\\AppData\\Local\\Temp\\joblib\n\nDataset stats ==================== >>\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 25.03 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n\nThe column dtypes are successfully converted.\n
                                                                                                                                                                                  In\u00a0[15]: Copied!
                                                                                                                                                                                  # Note the transformers are no longer fitted,\n# instead the results are immediately read from cache\natom.impute()\natom.encode()\n
                                                                                                                                                                                  # Note the transformers are no longer fitted, # instead the results are immediately read from cache atom.impute() atom.encode()
                                                                                                                                                                                  Retrieving cached results for Imputer...\nRetrieving cached results for Encoder...\nEncoding categorical columns...\n
                                                                                                                                                                                  In\u00a0[16]: Copied!
                                                                                                                                                                                  atom.dataset\n
                                                                                                                                                                                  atom.dataset Out[16]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 0.075703 13.0 30.5 0.0 6.8 10.0 0.271668 59 0.312069 0.273733 ... 19 8 1013.599976 1008.0 0 2 19.6 29.9 0.0 0 1 0.245394 15.3 22.4 16.0 4.2 3.3 0.204934 39 0.236475 0.199626 ... 83 63 1025.5 1023.599976 6 6 16.9 21.1 1.0 1 2 0.262397 27.9 34.5 0.0 9.0 7.9 0.1737 72 0.236475 0.306935 ... 72 63 1009.0 1005.5 7 7 31.0 33.099998 0.0 1 3 0.239174 12.9 27.9 0.0 5.4 8.6 0.269421 39 0.256213 0.286159 ... 69 56 1023.400024 1019.799988 7 7 14.7 23.4 0.0 0 4 0.253089 7.4 14.3 0.8 2.8 4.0 0.210095 31 0.269333 0.167808 ... 84 62 1023.599976 1023.200012 4 7 9.0 13.6 0.0 1 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 56415 0.295559 23.9 28.1 0.0 2.6 7.7 0.241448 44 0.279553 0.259391 ... 86 79 1015.900024 1013.900024 7 7 25.799999 27.5 0.0 0 56416 0.217037 13.6 24.6 0.0 4.4 7.8 0.1737 39 0.193908 0.197102 ... 87 61 1023.200012 1022.599976 7 3 17.299999 21.4 0.0 0 56417 0.112176 16.299999 38.700001 0.0 10.2 13.4 0.1737 24 0.149795 0.168702 ... 29 8 1013.5 1010.299988 5 2 26.4 36.900002 0.0 0 56418 0.295559 11.5 19.200001 0.8 2.0 7.0 0.147458 22 0.13795 0.195807 ... 73 52 1021.299988 1018.799988 3 4 17.1 18.4 0.0 0 56419 0.403054 5.9 18.0 0.4 0.8 6.7 0.269421 26 0.312069 0.286159 ... 92 65 1028.0 1025.300049 3 2 9.4 16.6 0.0 0

                                                                                                                                                                                  56420 rows \u00d7 22 columns

                                                                                                                                                                                  "}, {"location": "examples/memory_considerations/#example-memory-considerations", "title": "Example: Memory considerations\u00b6", "text": "

                                                                                                                                                                                  This example shows how to use the memory parameter to make efficient use of the available memory.

                                                                                                                                                                                  The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target RainTomorrow.

                                                                                                                                                                                  "}, {"location": "examples/memory_considerations/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/memory_considerations/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/multi_metric/", "title": "Multi-metric runs", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                  # Import packages\nimport pandas as pd\nfrom atom import ATOMRegressor\n
                                                                                                                                                                                  # Import packages import pandas as pd from atom import ATOMRegressor In\u00a0[2]: Copied!
                                                                                                                                                                                  # Load data\nX = pd.read_csv(\"docs_source/examples/datasets/abalone.csv\")\n\n# Let's have a look\nX.head()\n
                                                                                                                                                                                  # Load data X = pd.read_csv(\"docs_source/examples/datasets/abalone.csv\") # Let's have a look X.head() Out[2]: Sex Length Diameter Height Whole weight Shucked weight Viscera weight Shell weight Rings 0 M 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 15 1 M 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 7 2 F 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 9 3 M 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 10 4 I 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 7 In\u00a0[3]: Copied!
                                                                                                                                                                                  atom = ATOMRegressor(X, n_jobs=1, verbose=2, random_state=1)\n
                                                                                                                                                                                  atom = ATOMRegressor(X, n_jobs=1, verbose=2, random_state=1)
                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Regression.\n\nDataset stats ==================== >>\nShape: (4177, 9)\nTrain set size: 3342\nTest set size: 835\n-------------------------------------\nMemory: 300.88 kB\nScaled: False\nCategorical features: 1 (12.5%)\nOutlier values: 189 (0.6%)\n\n
                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                  atom.encode()\n
                                                                                                                                                                                  atom.encode()
                                                                                                                                                                                  Fitting Encoder...\nEncoding categorical columns...\n --> OneHot-encoding feature Sex. Contains 3 classes.\n
                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                  # For every step of the BO, both metrics are calculated,\n# but only the first is used for optimization!\natom.run(\n    models=[\"lsvm\", \"hGBM\"],\n    metric=(\"r2\", \"rmse\"),\n    n_trials=10,\n    n_bootstrap=6,\n)\n
                                                                                                                                                                                  # For every step of the BO, both metrics are calculated, # but only the first is used for optimization! atom.run( models=[\"lsvm\", \"hGBM\"], metric=(\"r2\", \"rmse\"), n_trials=10, n_bootstrap=6, )
                                                                                                                                                                                  \nTraining ========================= >>\nModels: lSVM, hGBM\nMetric: r2, rmse\n\n\nRunning hyperparameter tuning for LinearSVM...\n| trial |                    loss |       C |    dual |      r2 | best_r2 |    rmse | best_rmse | time_trial | time_ht |    state |\n| ----- | ----------------------- | ------- | ------- | ------- | ------- | ------- | --------- | ---------- | ------- | -------- |\n| 0     | squared_epsilon_insen.. |   0.001 |    True |  0.2887 |  0.2887 | -2.6528 |   -2.6528 |     0.043s |  0.043s | COMPLETE |\n| 1     | squared_epsilon_insen.. |  0.0534 |   False |  0.3862 |  0.3862 | -2.5926 |   -2.5926 |     0.043s |  0.086s | COMPLETE |\n| 2     | squared_epsilon_insen.. |  0.0105 |    True |   0.433 |   0.433 | -2.4084 |   -2.4084 |     0.054s |  0.140s | COMPLETE |\n| 3     |     epsilon_insensitive |  0.6215 |    True |  0.4022 |   0.433 | -2.5251 |   -2.4084 |     0.045s |  0.185s | COMPLETE |\n| 4     | squared_epsilon_insen.. |  0.0369 |   False |  0.4057 |   0.433 | -2.5477 |   -2.4084 |     0.040s |  0.225s | COMPLETE |\n| 5     |     epsilon_insensitive |  0.0016 |    True | -1.5344 |   0.433 | -5.0102 |   -2.4084 |     0.035s |  0.260s | COMPLETE |\n| 6     | squared_epsilon_insen.. | 61.5811 |   False |  0.4354 |  0.4354 | -2.3845 |   -2.3845 |     0.034s |  0.294s | COMPLETE |\n| 7     | squared_epsilon_insen.. |  14.898 |   False |  0.4925 |  0.4925 | -2.2628 |   -2.2628 |     0.035s |  0.329s | COMPLETE |\n| 8     |     epsilon_insensitive |  0.0252 |    True |  0.3695 |  0.4925 | -2.6178 |   -2.2628 |     0.035s |  0.364s | COMPLETE |\n| 9     | squared_epsilon_insen.. 
|  0.0294 |    True |  0.4767 |  0.4925 | -2.3896 |   -2.2628 |     0.044s |  0.408s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 7\nBest parameters:\n --> loss: squared_epsilon_insensitive\n --> C: 14.898\n --> dual: False\nBest evaluation --> r2: 0.4925   rmse: -2.2628\nTime elapsed: 0.408s\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.4592   rmse: -2.3795\nTest evaluation --> r2: 0.4584   rmse: -2.3369\nTime elapsed: 0.089s\nBootstrap ---------------------------------------\nEvaluation --> r2: 0.4577 \u00b1 0.002   rmse: -2.3384 \u00b1 0.0043\nTime elapsed: 0.094s\n-------------------------------------------------\nTime: 0.592s\n\n\nRunning hyperparameter tuning for HistGradientBoosting...\n| trial |      loss | quantile | learning_rate | max_iter | max_leaf_nodes | max_depth | min_samples_leaf | l2_regularization |      r2 | best_r2 |    rmse | best_rmse | time_trial | time_ht |    state |\n| ----- | --------- | -------- | ------------- | -------- | -------------- | --------- | ---------------- | ----------------- | ------- | ------- | ------- | --------- | ---------- | ------- | -------- |\n| 0     | absolut.. 
|      0.1 |        0.0236 |      180 |             26 |        12 |               11 |               0.0 |  0.5373 |  0.5373 | -2.1398 |   -2.1398 |     0.968s |  0.968s | COMPLETE |\n| 1     |     gamma |      0.5 |         0.242 |      160 |             38 |         3 |               20 |               0.0 |   0.574 |   0.574 | -2.1598 |   -2.1398 |     0.160s |  1.128s | COMPLETE |\n| 2     |  quantile |      0.4 |        0.2448 |      210 |             12 |         3 |               25 |               0.3 |  0.4714 |   0.574 | -2.3253 |   -2.1398 |     0.422s |  1.550s | COMPLETE |\n| 3     |  quantile |      0.6 |         0.017 |      480 |             28 |        16 |               13 |               0.1 |  0.5712 |   0.574 | -2.1385 |   -2.1385 |     3.405s |  4.956s | COMPLETE |\n| 4     | squared.. |      1.0 |        0.2649 |       70 |             10 |        10 |               28 |               0.8 |  0.5561 |   0.574 | -2.2019 |   -2.1385 |     0.148s |  5.104s | COMPLETE |\n| 5     | squared.. |      0.1 |        0.0283 |      360 |             32 |         9 |               11 |               0.5 |  0.5464 |   0.574 | -2.1197 |   -2.1197 |     1.248s |  6.352s | COMPLETE |\n| 6     |  quantile |      0.4 |        0.1264 |      380 |             37 |        12 |               29 |               1.0 |  0.4416 |   0.574 | -2.3713 |   -2.1197 |     3.002s |  9.354s | COMPLETE |\n| 7     |     gamma |      0.6 |         0.678 |      330 |             25 |         6 |               12 |               0.8 |  0.4299 |   0.574 | -2.3984 |   -2.1197 |     0.739s | 10.092s | COMPLETE |\n| 8     | absolut.. |      0.9 |        0.0831 |      280 |             42 |        16 |               10 |               1.0 |  0.5242 |   0.574 | -2.2742 |   -2.1197 |     2.002s | 12.094s | COMPLETE |\n| 9     | absolut.. 
|      0.6 |        0.0373 |      300 |             40 |        13 |               17 |               0.8 |  0.5685 |   0.574 |   -2.17 |   -2.1197 |     1.859s | 13.953s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 5\nBest parameters:\n --> loss: squared_error\n --> quantile: 0.1\n --> learning_rate: 0.0283\n --> max_iter: 360\n --> max_leaf_nodes: 32\n --> max_depth: 9\n --> min_samples_leaf: 11\n --> l2_regularization: 0.5\nBest evaluation --> r2: 0.5464   rmse: -2.1197\nTime elapsed: 13.953s\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.7959   rmse: -1.4619\nTest evaluation --> r2: 0.5479   rmse: -2.1351\nTime elapsed: 1.470s\nBootstrap ---------------------------------------\nEvaluation --> r2: 0.5259 \u00b1 0.0154   rmse: -2.1861 \u00b1 0.0352\nTime elapsed: 7.930s\n-------------------------------------------------\nTime: 23.353s\n\n\nFinal results ==================== >>\nTotal time: 25.299s\n-------------------------------------\nLinearSVM            --> r2: 0.4577 \u00b1 0.002   rmse: -2.3384 \u00b1 0.0043\nHistGradientBoosting --> r2: 0.5259 \u00b1 0.0154   rmse: -2.1861 \u00b1 0.0352 ~ !\n
                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                  # Check the robustness of the pipeline using cross-validation\natom.winner.cross_validate()\n
                                                                                                                                                                                  # Check the robustness of the pipeline using cross-validation atom.winner.cross_validate()
                                                                                                                                                                                  Applying cross-validation...\n
                                                                                                                                                                                  Out[6]: train_r2 test_r2 train_rmse test_rmse time (s) 0 0.796038 0.541990 -1.453147 -2.196943 1.392266 1 0.794954 0.540424 -1.457709 -2.196179 1.436932 2 0.790722 0.505922 -1.492522 -2.153457 1.444314 3 0.785317 0.580703 -1.474827 -2.189902 1.432303 4 0.795872 0.547917 -1.461929 -2.135072 1.747591 mean 0.792581 0.543391 -1.468027 -2.174311 1.490681 std 0.004114 0.023780 0.014222 0.025330 0.129719 In\u00a0[8]: Copied!
                                                                                                                                                                                  # The columns in the results dataframe contain one for each metric\natom.results[[\"r2_ht\", \"r2_train\", \"r2_test\", \"rmse_ht\", \"rmse_train\", \"rmse_test\"]]\n
                                                                                                                                                                                  # The columns in the results dataframe contain one for each metric atom.results[[\"r2_ht\", \"r2_train\", \"r2_test\", \"rmse_ht\", \"rmse_train\", \"rmse_test\"]] Out[8]: r2_ht r2_train r2_test rmse_ht rmse_train rmse_test lSVM 0.492530 0.4583 0.4552 -2.262754 -2.3815 -2.3439 hGBM 0.546368 0.7183 0.4971 -2.119672 -1.7173 -2.2518 In\u00a0[9]: Copied!
                                                                                                                                                                                  # Some plots allow us to choose the metric we want to show\nwith atom.canvas():\n    atom.plot_trials(metric=\"r2\", title=\"Hyperparameter tuning performance for R2\")\n    atom.plot_trials(metric=\"rmse\", title=\"Hyperparameter tuning performance for RMSE\")\n
                                                                                                                                                                                  # Some plots allow us to choose the metric we want to show with atom.canvas(): atom.plot_trials(metric=\"r2\", title=\"Hyperparameter tuning performance for R2\") atom.plot_trials(metric=\"rmse\", title=\"Hyperparameter tuning performance for RMSE\") In\u00a0[10]: Copied!
                                                                                                                                                                                  atom.plot_results(metric=\"r2\")\n
                                                                                                                                                                                  atom.plot_results(metric=\"r2\")"}, {"location": "examples/multi_metric/#example-multi-metric-runs", "title": "Example: Multi-metric runs\u00b6", "text": "

                                                                                                                                                                                  This example shows how to evaluate an atom's pipeline on multiple metrics.

                                                                                                                                                                                  Import the breast cancer dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.

                                                                                                                                                                                  "}, {"location": "examples/multi_metric/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/multi_metric/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/multi_metric/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/multiclass_classification/", "title": "Multiclass classification", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                  # Import packages\nfrom sklearn.datasets import load_wine\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                  # Import packages from sklearn.datasets import load_wine from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                  # Load data\nX, y = load_wine(return_X_y=True, as_frame=True)\n\n# Let's have a look\nX.head()\n
                                                                                                                                                                                  # Load data X, y = load_wine(return_X_y=True, as_frame=True) # Let's have a look X.head() Out[2]: alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue od280/od315_of_diluted_wines proline 0 14.23 1.71 2.43 15.6 127.0 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065.0 1 13.20 1.78 2.14 11.2 100.0 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050.0 2 13.16 2.36 2.67 18.6 101.0 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185.0 3 14.37 1.95 2.50 16.8 113.0 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480.0 4 13.24 2.59 2.87 21.0 118.0 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735.0 In\u00a0[3]: Copied!
                                                                                                                                                                                  atom = ATOMClassifier(X, y, n_jobs=-1, verbose=2, random_state=1)\n\n# Fit the pipeline with the selected models\natom.run(\n    models=[\"LR\",\"LDA\", \"RF\"],\n    metric=\"roc_auc_ovr\",\n    n_trials=14,\n    n_bootstrap=5,\n    errors=\"raise\",\n)\n
                                                                                                                                                                                  atom = ATOMClassifier(X, y, n_jobs=-1, verbose=2, random_state=1) # Fit the pipeline with the selected models atom.run( models=[\"LR\",\"LDA\", \"RF\"], metric=\"roc_auc_ovr\", n_trials=14, n_bootstrap=5, errors=\"raise\", )
                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Multiclass classification.\nParallel processing with 16 cores.\nParallelization backend: loky\n\nDataset stats ==================== >>\nShape: (178, 14)\nTrain set size: 143\nTest set size: 35\n-------------------------------------\nMemory: 19.36 kB\nScaled: False\nOutlier values: 12 (0.6%)\n\n\nTraining ========================= >>\nModels: LR, LDA, RF\nMetric: roc_auc_ovr\n\n\nRunning hyperparameter tuning for LogisticRegression...\n| trial | penalty |       C |  solver | max_iter | l1_ratio | roc_auc_ovr | best_roc_auc_ovr | time_trial | time_ht |    state |\n| ----- | ------- | ------- | ------- | -------- | -------- | ----------- | ---------------- | ---------- | ------- | -------- |\n| 0     |      l1 |  0.0054 |    saga |      480 |      0.7 |         0.5 |              0.5 |    10.567s | 10.567s | COMPLETE |\n| 1     |      l1 |   0.122 |    saga |      380 |      0.7 |      0.9951 |           0.9951 |    11.247s | 21.814s | COMPLETE |\n| 2     |      l2 |  0.0071 |     sag |      720 |      0.3 |         1.0 |              1.0 |    12.060s | 33.874s | COMPLETE |\n| 3     |      l1 | 87.9641 | libli.. |      920 |      0.3 |         1.0 |              1.0 |    10.158s | 44.032s | COMPLETE |\n| 4     |      l2 |  0.0114 |     sag |      630 |      0.7 |         1.0 |              1.0 |     7.990s | 52.022s | COMPLETE |\n| 5     |      l2 |  0.0018 |     sag |      920 |      0.1 |         1.0 |              1.0 |    11.685s | 01m:04s | COMPLETE |\n| 6     |      l2 | 43.4053 |     sag |      780 |      0.3 |         1.0 |              1.0 |     8.361s | 01m:12s | COMPLETE |\n| 7     |      l2 |  2.0759 | libli.. 
|      470 |      0.2 |         1.0 |              1.0 |     8.213s | 01m:20s | COMPLETE |\n| 8     |    None |   0.043 |     sag |      110 |      1.0 |         1.0 |              1.0 |     7.450s | 01m:28s | COMPLETE |\n| 9     |      l1 | 46.0233 |    saga |      740 |      0.1 |         1.0 |              1.0 |     7.951s | 01m:36s | COMPLETE |\n| 10    |      l2 |  0.4557 |   lbfgs |      280 |      0.5 |         1.0 |              1.0 |     8.807s | 01m:44s | COMPLETE |\n| 11    |      l2 |  0.0013 | libli.. |      940 |      0.4 |         1.0 |              1.0 |     7.970s | 01m:52s | COMPLETE |\n| 12    |      l2 |  4.8717 | newto.. |      780 |      0.3 |         1.0 |              1.0 |     8.202s | 02m:01s | COMPLETE |\n| 13    |      l2 |  0.0324 | libli.. |     1000 |      0.0 |         1.0 |              1.0 |     7.676s | 02m:08s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 2\nBest parameters:\n --> penalty: l2\n --> C: 0.0071\n --> solver: sag\n --> max_iter: 720\n --> l1_ratio: 0.3\nBest evaluation --> roc_auc_ovr: 1.0\nTime elapsed: 02m:08s\nFit ---------------------------------------------\nTrain evaluation --> roc_auc_ovr: 0.9991\nTest evaluation --> roc_auc_ovr: 0.9977\nTime elapsed: 0.542s\nBootstrap ---------------------------------------\nEvaluation --> roc_auc_ovr: 0.9984 \u00b1 0.001\nTime elapsed: 0.603s\n-------------------------------------------------\nTime: 02m:09s\n\n\nRunning hyperparameter tuning for LinearDiscriminantAnalysis...\n| trial |  solver | shrinkage | roc_auc_ovr | best_roc_auc_ovr | time_trial | time_ht |    state |\n| ----- | ------- | --------- | ----------- | ---------------- | ---------- | ------- | -------- |\n| 0     |    lsqr |       0.9 |      0.9221 |           0.9221 |     0.048s |  0.048s | COMPLETE |\n| 1     |   eigen |       1.0 |      0.9121 |           0.9221 |     0.027s |  0.074s | COMPLETE |\n| 2     |   eigen |       1.0 |      0.9121 |           0.9221 |     
0.001s |  0.075s | COMPLETE |\n| 3     |    lsqr |       0.7 |      0.8638 |           0.9221 |     0.025s |  0.100s | COMPLETE |\n| 4     |   eigen |       0.7 |      0.9019 |           0.9221 |     0.024s |  0.124s | COMPLETE |\n| 5     |    lsqr |      auto |         1.0 |              1.0 |     0.025s |  0.149s | COMPLETE |\n| 6     |   eigen |       1.0 |      0.9121 |              1.0 |     0.000s |  0.149s | COMPLETE |\n| 7     |    lsqr |       1.0 |      0.9445 |              1.0 |     0.026s |  0.175s | COMPLETE |\n| 8     |     svd |      None |         1.0 |              1.0 |     0.025s |  0.200s | COMPLETE |\n| 9     |     svd |      None |         1.0 |              1.0 |     0.001s |  0.201s | COMPLETE |\n| 10    |    lsqr |      auto |         1.0 |              1.0 |     0.002s |  0.203s | COMPLETE |\n| 11    |     svd |      None |         1.0 |              1.0 |     0.002s |  0.205s | COMPLETE |\n| 12    |     svd |      None |         1.0 |              1.0 |     0.001s |  0.206s | COMPLETE |\n| 13    |     svd |      None |         1.0 |              1.0 |     0.001s |  0.207s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 5\nBest parameters:\n --> solver: lsqr\n --> shrinkage: auto\nBest evaluation --> roc_auc_ovr: 1.0\nTime elapsed: 0.207s\nFit ---------------------------------------------\nTrain evaluation --> roc_auc_ovr: 1.0\nTest evaluation --> roc_auc_ovr: 1.0\nTime elapsed: 0.025s\nBootstrap ---------------------------------------\nEvaluation --> roc_auc_ovr: 0.9998 \u00b1 0.0005\nTime elapsed: 0.038s\n-------------------------------------------------\nTime: 0.271s\n\n\nRunning hyperparameter tuning for RandomForest...\n| trial | n_estimators | criterion | max_depth | min_samples_split | min_samples_leaf | max_features | bootstrap | max_samples | ccp_alpha | roc_auc_ovr | best_roc_auc_ovr | time_trial | time_ht |    state |\n| ----- | ------------ | --------- | --------- | ----------------- | 
---------------- | ------------ | --------- | ----------- | --------- | ----------- | ---------------- | ---------- | ------- | -------- |\n| 0     |          210 |      gini |        10 |                17 |               20 |          0.5 |     False |        None |       0.0 |      0.9803 |           0.9803 |     0.249s |  0.249s | COMPLETE |\n| 1     |          380 |      gini |         4 |                15 |                3 |          0.9 |     False |        None |      0.01 |      0.9816 |           0.9816 |     0.456s |  0.705s | COMPLETE |\n| 2     |          380 |   entropy |         6 |                 2 |               13 |          0.9 |     False |        None |      0.03 |      0.9944 |           0.9944 |     0.502s |  1.206s | COMPLETE |\n| 3     |          470 |      gini |        11 |                 9 |               18 |          nan |      True |         0.6 |     0.025 |      0.9569 |           0.9944 |     9.106s | 10.312s | COMPLETE |\n| 4     |          100 |   entropy |        12 |                14 |                6 |          0.9 |     False |         nan |     0.035 |         1.0 |              1.0 |     8.530s | 18.842s | COMPLETE |\n| 5     |          470 |   entropy |        13 |                11 |                1 |          nan |      True |         0.6 |      0.01 |         1.0 |              1.0 |     1.391s | 20.233s | COMPLETE |\n| 6     |          250 |      gini |        14 |                13 |               17 |          0.7 |      True |         nan |      0.02 |         1.0 |              1.0 |     0.754s | 20.987s | COMPLETE |\n| 7     |          220 |      gini |         5 |                10 |                7 |          0.5 |      True |         0.9 |     0.035 |      0.9981 |              1.0 |     0.712s | 21.699s | COMPLETE |\n| 8     |          130 |   entropy |         4 |                 6 |               11 |          0.9 |     False |         nan |      0.03 |         1.0 |              1.0 |     0.532s | 
22.231s | COMPLETE |\n| 9     |          370 |      gini |        12 |                 2 |                4 |          0.5 |     False |         nan |      0.02 |      0.9916 |              1.0 |     0.823s | 23.055s | COMPLETE |\n| 10    |           10 |   entropy |        12 |                20 |                7 |         log2 |     False |         nan |     0.035 |         1.0 |              1.0 |     0.522s | 23.577s | COMPLETE |\n| 11    |           70 |   entropy |        13 |                12 |                1 |         None |      True |         0.5 |      0.01 |      0.9928 |              1.0 |     0.614s | 24.191s | COMPLETE |\n| 12    |          500 |   entropy |         9 |                 7 |                7 |          0.6 |      True |         0.6 |      0.01 |         1.0 |              1.0 |     1.139s | 25.330s | COMPLETE |\n| 13    |          140 |   entropy |        16 |                16 |                1 |          0.8 |      True |         0.7 |       0.0 |         1.0 |              1.0 |     0.750s | 26.080s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 4\nBest parameters:\n --> n_estimators: 100\n --> criterion: entropy\n --> max_depth: 12\n --> min_samples_split: 14\n --> min_samples_leaf: 6\n --> max_features: 0.9\n --> bootstrap: False\n --> max_samples: None\n --> ccp_alpha: 0.035\nBest evaluation --> roc_auc_ovr: 1.0\nTime elapsed: 26.080s\nFit ---------------------------------------------\nTrain evaluation --> roc_auc_ovr: 0.9993\nTest evaluation --> roc_auc_ovr: 1.0\nTime elapsed: 0.737s\nBootstrap ---------------------------------------\nEvaluation --> roc_auc_ovr: 0.9936 \u00b1 0.0067\nTime elapsed: 0.721s\n-------------------------------------------------\nTime: 27.539s\n\n\nFinal results ==================== >>\nTotal time: 02m:40s\n-------------------------------------\nLogisticRegression         --> roc_auc_ovr: 0.9984 \u00b1 0.001\nLinearDiscriminantAnalysis --> roc_auc_ovr: 0.9998 \u00b1 
0.0005 !\nRandomForest               --> roc_auc_ovr: 0.9936 \u00b1 0.0067\n
                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                  atom.results\n
                                                                                                                                                                                  atom.results Out[4]: roc_auc_ovr_ht time_ht roc_auc_ovr_train roc_auc_ovr_test time_fit roc_auc_ovr_bootstrap time_bootstrap time LR 1.0 128.337325 0.9979 0.9977 0.542487 0.998413 0.602810 129.482622 LDA 1.0 0.207456 1.0000 0.9989 0.025409 0.999773 0.038035 0.270900 RF 1.0 26.080413 0.9951 0.9919 0.737324 0.993613 0.721398 27.539135 In\u00a0[5]: Copied!
                                                                                                                                                                                  # Show the score for some different metrics\natom.evaluate([\"precision_macro\", \"recall_macro\", \"jaccard_weighted\"])\n
                                                                                                                                                                                  # Show the score for some different metrics atom.evaluate([\"precision_macro\", \"recall_macro\", \"jaccard_weighted\"]) Out[5]: precision_macro recall_macro jaccard_weighted LR 0.9429 0.9484 0.8924 LDA 0.9667 0.9762 0.9457 RF 0.8799 0.8915 0.7968 In\u00a0[10]: Copied!
                                                                                                                                                                                  # Some plots allow you to choose the target class to look at\natom.rf.plot_probabilities(rows=\"train\", target=0)\n
                                                                                                                                                                                  # Some plots allow you to choose the target class to look at atom.rf.plot_probabilities(rows=\"train\", target=0) In\u00a0[8]: Copied!
                                                                                                                                                                                  atom.lda.plot_shap_heatmap(target=2, show=7)\n
                                                                                                                                                                                  atom.lda.plot_shap_heatmap(target=2, show=7)"}, {"location": "examples/multiclass_classification/#example-multiclass-classification", "title": "Example: Multiclass classification\u00b6", "text": "

                                                                                                                                                                                  This example shows how to compare the performance of three models on a multiclass classification task.

                                                                                                                                                                                  Import the wine dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict wines into three groups (which cultivator it's from) using features based on the results of chemical analysis.

                                                                                                                                                                                  "}, {"location": "examples/multiclass_classification/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/multiclass_classification/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/multiclass_classification/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/multilabel_classification/", "title": "Multilabel classification", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                  # Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\nfrom sklearn.datasets import make_multilabel_classification\n
                                                                                                                                                                                  # Import packages import pandas as pd from atom import ATOMClassifier from sklearn.datasets import make_multilabel_classification In\u00a0[2]: Copied!
                                                                                                                                                                                  # Create data\nX, y = make_multilabel_classification(n_samples=300, n_classes=3, random_state=1)\n
                                                                                                                                                                                  # Create data X, y = make_multilabel_classification(n_samples=300, n_classes=3, random_state=1) In\u00a0[3]: Copied!
                                                                                                                                                                                  # Note that for multioutput tasks, you must specify the `y` keyword\natom = ATOMClassifier(X, y=y, verbose=2, random_state=1)\n
                                                                                                                                                                                  # Note that for multioutput tasks, you must specify the `y` keyword atom = ATOMClassifier(X, y=y, verbose=2, random_state=1)
                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Multilabel classification.\n\nDataset stats ==================== >>\nShape: (300, 23)\nTrain set size: 240\nTest set size: 60\n-------------------------------------\nMemory: 51.73 kB\nScaled: False\nOutlier values: 29 (0.5%)\n\n
                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                  # Show the models that natively support multilabel tasks\natom.available_models()[[\"acronym\", \"model\", \"native_multilabel\"]]\n
                                                                                                                                                                                  # Show the models that natively support multilabel tasks atom.available_models()[[\"acronym\", \"model\", \"native_multilabel\"]] Out[4]: acronym model native_multilabel 0 AdaB AdaBoost False 1 Bag Bagging False 2 BNB BernoulliNB False 3 CatB CatBoost False 4 CatNB CategoricalNB False 5 CNB ComplementNB False 6 Tree DecisionTree True 7 Dummy Dummy False 8 ETree ExtraTree True 9 ET ExtraTrees True 10 GNB GaussianNB False 11 GP GaussianProcess False 12 GBM GradientBoostingMachine False 13 hGBM HistGradientBoosting False 14 KNN KNearestNeighbors True 15 LGB LightGBM False 16 LDA LinearDiscriminantAnalysis False 17 lSVM LinearSVM False 18 LR LogisticRegression False 19 MLP MultiLayerPerceptron True 20 MNB MultinomialNB False 21 PA PassiveAggressive False 22 Perc Perceptron False 23 QDA QuadraticDiscriminantAnalysis False 24 RNN RadiusNearestNeighbors True 25 RF RandomForest True 26 Ridge Ridge True 27 SGD StochasticGradientDescent False 28 SVM SupportVectorMachine False 29 XGB XGBoost False In\u00a0[5]: Copied!
                                                                                                                                                                                  atom.run(models=[\"LDA\", \"RF\"], metric=\"recall_weighted\")\n
                                                                                                                                                                                  atom.run(models=[\"LDA\", \"RF\"], metric=\"recall_weighted\")
                                                                                                                                                                                  \nTraining ========================= >>\nModels: LDA, RF\nMetric: recall_weighted\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --> recall_weighted: 0.9124\nTest evaluation --> recall_weighted: 0.8351\nTime elapsed: 0.037s\n-------------------------------------------------\nTime: 0.037s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> recall_weighted: 1.0\nTest evaluation --> recall_weighted: 0.8763\nTime elapsed: 0.170s\n-------------------------------------------------\nTime: 0.170s\n\n\nFinal results ==================== >>\nTotal time: 0.269s\n-------------------------------------\nLinearDiscriminantAnalysis --> recall_weighted: 0.8351\nRandomForest               --> recall_weighted: 0.8763 !\n
                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                  # Note that non-native multioutput models use a meta-estimator wrapper\nprint(f\"Estimator for LDA is: {atom.lda.estimator}\")\nprint(f\"Estimator for RF is: {atom.rf.estimator}\")\n
                                                                                                                                                                                  # Note that non-native multioutput models use a meta-estimator wrapper print(f\"Estimator for LDA is: {atom.lda.estimator}\") print(f\"Estimator for RF is: {atom.rf.estimator}\")
                                                                                                                                                                                  Estimator for LDA is: ClassifierChain(base_estimator=LinearDiscriminantAnalysis(), random_state=1)\nEstimator for RF is: RandomForestClassifier(n_jobs=1, random_state=1)\n
                                                                                                                                                                                  In\u00a0[7]: Copied!
                                                                                                                                                                                  from atom import ATOMModel\nfrom sklearn.multioutput import ClassifierChain\nfrom sklearn.linear_model import LogisticRegression\nfrom optuna.distributions import CategoricalDistribution, IntDistribution\n\ncustom_model = ATOMModel(\n    estimator=ClassifierChain(LogisticRegression(), cv=3),\n    name=\"chain\",\n    needs_scaling=True,\n    native_multilabel=True,\n)\n\natom.run(\n    models=custom_model,\n    n_trials=5,\n    ht_params={\n        \"distributions\": {\n            \"order\": CategoricalDistribution([[0, 1, 2], [2, 1, 0], [1, 2, 0]]),\n            \"base_estimator__max_iter\": IntDistribution(100, 200, step=10),\n            \"base_estimator__solver\": CategoricalDistribution([\"lbfgs\", \"newton-cg\"]),            \n        }\n    },\n)\n
                                                                                                                                                                                  from atom import ATOMModel from sklearn.multioutput import ClassifierChain from sklearn.linear_model import LogisticRegression from optuna.distributions import CategoricalDistribution, IntDistribution custom_model = ATOMModel( estimator=ClassifierChain(LogisticRegression(), cv=3), name=\"chain\", needs_scaling=True, native_multilabel=True, ) atom.run( models=custom_model, n_trials=5, ht_params={ \"distributions\": { \"order\": CategoricalDistribution([[0, 1, 2], [2, 1, 0], [1, 2, 0]]), \"base_estimator__max_iter\": IntDistribution(100, 200, step=10), \"base_estimator__solver\": CategoricalDistribution([\"lbfgs\", \"newton-cg\"]), } }, )
                                                                                                                                                                                  \nTraining ========================= >>\nModels: chain\nMetric: recall_weighted\n\n\nRunning hyperparameter tuning for ClassifierChain...\n| trial |     order | base_estimator__max_iter | base_estimator__solver | recall_weighted | best_recall_weighted | time_trial | time_ht |    state |\n| ----- | --------- | ------------------------ | ---------------------- | --------------- | -------------------- | ---------- | ------- | -------- |\n| 0     | [2, 1, 0] |                      130 |                  lbfgs |          0.8831 |               0.8831 |     2.813s |  2.813s | COMPLETE |\n| 1     | [1, 2, 0] |                      150 |              newton-cg |          0.9091 |               0.9091 |     2.184s |  4.997s | COMPLETE |\n| 2     | [2, 1, 0] |                      170 |              newton-cg |          0.8701 |               0.9091 |     0.085s |  5.082s | COMPLETE |\n| 3     | [1, 2, 0] |                      200 |              newton-cg |          0.9221 |               0.9221 |     0.084s |  5.166s | COMPLETE |\n| 4     | [2, 1, 0] |                      100 |              newton-cg |          0.8701 |               0.9221 |     0.078s |  5.244s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 3\nBest parameters:\n --> order: [1, 2, 0]\n --> base_estimator__max_iter: 200\n --> base_estimator__solver: newton-cg\nBest evaluation --> recall_weighted: 0.9221\nTime elapsed: 5.244s\nFit ---------------------------------------------\nTrain evaluation --> recall_weighted: 0.9021\nTest evaluation --> recall_weighted: 0.866\nTime elapsed: 0.101s\n-------------------------------------------------\nTime: 5.345s\n\n\nFinal results ==================== >>\nTotal time: 5.397s\n-------------------------------------\nClassifierChain --> recall_weighted: 0.866\n
                                                                                                                                                                                  In\u00a0[8]: Copied!
                                                                                                                                                                                  thresholds = atom.rf.get_best_threshold()\nprint(f\"Best threshold per target column: {thresholds}\")\n
                                                                                                                                                                                  thresholds = atom.rf.get_best_threshold() print(f\"Best threshold per target column: {thresholds}\")
                                                                                                                                                                                  Best threshold per target column: [0.7, 0.69, 0.63]\n
                                                                                                                                                                                  In\u00a0[9]: Copied!
                                                                                                                                                                                  atom.rf.evaluate(threshold=thresholds)\n
                                                                                                                                                                                  atom.rf.evaluate(threshold=thresholds) Out[9]:
                                                                                                                                                                                  accuracy              0.5667\nap                    0.8893\nf1_weighted           0.7274\njaccard_weighted      0.6271\nprecision_weighted    0.8269\nrecall_weighted       0.6495\nauc                   0.9213\nName: RF, dtype: float64
                                                                                                                                                                                  In\u00a0[10]: Copied!
                                                                                                                                                                                  # Use the target parameter in plots to specify which target column to use\natom.plot_roc(target=2)\n
                                                                                                                                                                                  # Use the target parameter in plots to specify which target column to use atom.plot_roc(target=2) In\u00a0[11]: Copied!
                                                                                                                                                                                  # When the target parameter also specifies the class, use format (column, class)\natom.plot_probabilities(models=\"chain\", target=(2, 1))\n
                                                                                                                                                                                  # When the target parameter also specifies the class, use format (column, class) atom.plot_probabilities(models=\"chain\", target=(2, 1)) In\u00a0[12]: Copied!
                                                                                                                                                                                  with atom.canvas(figsize=(900, 600)):\n    atom.plot_calibration(target=0)\n    atom.plot_calibration(target=1)\n
                                                                                                                                                                                  with atom.canvas(figsize=(900, 600)): atom.plot_calibration(target=0) atom.plot_calibration(target=1)"}, {"location": "examples/multilabel_classification/#example-multilabel-classification", "title": "Example: Multilabel classification\u00b6", "text": "

                                                                                                                                                                                  This example shows how to use ATOM to solve a multilabel classification problem.

                                                                                                                                                                                  The data used is a synthetic dataset created using sklearn's make_multilabel_classification function.

                                                                                                                                                                                  "}, {"location": "examples/multilabel_classification/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/multilabel_classification/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/multilabel_classification/#add-custom-multilabel-models", "title": "Add custom multilabel models\u00b6", "text": "

                                                                                                                                                                                  To use your own meta-estimator with custom parameters, add it as a custom model. It's also possible to tune the hyperparameters of this custom meta-estimator.

                                                                                                                                                                                  "}, {"location": "examples/multilabel_classification/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/multioutput_regression/", "title": "Multioutput regression", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                  # Disable annoying tf warnings\nimport os\nos.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"3\"\n\nfrom tensorflow import get_logger\nget_logger().setLevel('ERROR')\n\nimport numpy as np\nfrom atom import ATOMRegressor, ATOMModel\nfrom sklearn.datasets import make_regression\n\nfrom scikeras.wrappers import KerasRegressor\nfrom keras.models import Sequential\nfrom keras.layers import Dense\n
                                                                                                                                                                                  # Disable annoying tf warnings import os os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"3\" from tensorflow import get_logger get_logger().setLevel('ERROR') import numpy as np from atom import ATOMRegressor, ATOMModel from sklearn.datasets import make_regression from scikeras.wrappers import KerasRegressor from keras.models import Sequential from keras.layers import Dense In\u00a0[2]: Copied!
                                                                                                                                                                                  # Create data\nX, y = make_regression(n_samples=1000, n_features=10, n_informative=5, n_targets=3)\n
                                                                                                                                                                                  # Create data X, y = make_regression(n_samples=1000, n_features=10, n_informative=5, n_targets=3) In\u00a0[3]: Copied!
                                                                                                                                                                                  # Create the neural network\nclass NeuralNetwork(KerasRegressor):\n    \"\"\"Multioutput multilayer perceptron.\"\"\"\n\n    @staticmethod\n    def _keras_build_fn(n_inputs, n_outputs, **kwargs):\n        \"\"\"Create the model's architecture.\"\"\"\n        model = Sequential()\n        model.add(Dense(20, input_dim=n_inputs, activation=\"relu\"))\n        model.add(Dense(20, activation=\"relu\"))\n        model.add(Dense(n_outputs))\n        model.compile(loss=\"mse\", optimizer=\"adam\")\n        return model\n
                                                                                                                                                                                  # Create the neural network class NeuralNetwork(KerasRegressor): \"\"\"Multioutput multilayer perceptron.\"\"\" @staticmethod def _keras_build_fn(n_inputs, n_outputs, **kwargs): \"\"\"Create the model's architecture.\"\"\" model = Sequential() model.add(Dense(20, input_dim=n_inputs, activation=\"relu\")) model.add(Dense(20, activation=\"relu\")) model.add(Dense(n_outputs)) model.compile(loss=\"mse\", optimizer=\"adam\") return model In\u00a0[4]: Copied!
                                                                                                                                                                                  # Convert the model to an ATOM model\nmodel = ATOMModel(\n    estimator=NeuralNetwork(n_inputs=5, n_outputs=y.shape[1], epochs=100, verbose=0),\n    name=\"NN\",\n    needs_scaling=True,  # Applies automated feature scaling before fitting\n    native_multioutput=True,  # Do not use a multioutput meta-estimator wrapper\n)\n
                                                                                                                                                                                  # Convert the model to an ATOM model model = ATOMModel( estimator=NeuralNetwork(n_inputs=5, n_outputs=y.shape[1], epochs=100, verbose=0), name=\"NN\", needs_scaling=True, # Applies automated feature scaling before fitting native_multioutput=True, # Do not use a multioutput meta-estimator wrapper ) In\u00a0[5]: Copied!
                                                                                                                                                                                  atom = ATOMRegressor(X, y=y, verbose=2, random_state=1)\n
                                                                                                                                                                                  atom = ATOMRegressor(X, y=y, verbose=2, random_state=1)
                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Multioutput regression.\n\nDataset stats ==================== >>\nShape: (1000, 13)\nTrain set size: 800\nTest set size: 200\n-------------------------------------\nMemory: 104.13 kB\nScaled: True\nOutlier values: 27 (0.3%)\n\n
                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                  # Show the models that natively support multioutput tasks\natom.available_models()[[\"acronym\", \"model\", \"native_multioutput\"]]\n
                                                                                                                                                                                  # Show the models that natively support multioutput tasks atom.available_models()[[\"acronym\", \"model\", \"native_multioutput\"]] Out[6]: acronym model native_multioutput 0 AdaB AdaBoost False 1 ARD AutomaticRelevanceDetermination False 2 Bag Bagging False 3 BR BayesianRidge False 4 CatB CatBoost False 5 Tree DecisionTree True 6 Dummy Dummy False 7 EN ElasticNet False 8 ETree ExtraTree True 9 ET ExtraTrees True 10 GP GaussianProcess False 11 GBM GradientBoostingMachine False 12 Huber HuberRegression False 13 hGBM HistGradientBoosting False 14 KNN KNearestNeighbors True 15 Lasso Lasso False 16 Lars LeastAngleRegression False 17 LGB LightGBM False 18 lSVM LinearSVM False 19 MLP MultiLayerPerceptron False 20 OLS OrdinaryLeastSquares False 21 OMP OrthogonalMatchingPursuit False 22 PA PassiveAggressive False 23 RNN RadiusNearestNeighbors True 24 RF RandomForest True 25 Ridge Ridge False 26 SGD StochasticGradientDescent False 27 SVM SupportVectorMachine False 28 XGB XGBoost False In\u00a0[7]: Copied!
                                                                                                                                                                                  # Note we only added 5 informative features to the dataset, let's remove the rest\n# If we use a model with no native support for multioutput as solver, specify the\n# rfe's importance_getter parameter and return the mean of the coefficients over the\n# target columns\natom.feature_selection(\n    strategy=\"rfe\",\n    solver=\"ols\",  # This becomes MultiOutputRegressor(OLS)\n    n_features=5,\n    importance_getter=lambda x: np.mean([e.coef_ for e in x.estimators_], axis=0),\n)\n
                                                                                                                                                                                  # Note we only added 5 informative features to the dataset, let's remove the rest # If we use a model with no native support for multioutput as solver, specify the # rfe's importance_getter parameter and return the mean of the coefficients over the # target columns atom.feature_selection( strategy=\"rfe\", solver=\"ols\", # This becomes MultiOutputRegressor(OLS) n_features=5, importance_getter=lambda x: np.mean([e.coef_ for e in x.estimators_], axis=0), )
                                                                                                                                                                                  Fitting FeatureSelector...\nPerforming feature selection ...\n --> rfe selected 5 features from the dataset.\n   --> Dropping feature x0 (rank 6).\n   --> Dropping feature x5 (rank 5).\n   --> Dropping feature x6 (rank 3).\n   --> Dropping feature x7 (rank 2).\n   --> Dropping feature x9 (rank 4).\n
                                                                                                                                                                                  In\u00a0[8]: Copied!
                                                                                                                                                                                  # Let's train a native, non-native and our custom model\natom.run(models=[\"Lasso\", \"RF\", model], metric=\"mse\")\n
                                                                                                                                                                                  # Let's train a native, non-native and our custom model atom.run(models=[\"Lasso\", \"RF\", model], metric=\"mse\")
                                                                                                                                                                                  \nTraining ========================= >>\nModels: Lasso, RF, NN\nMetric: mse\n\n\nResults for Lasso:\nFit ---------------------------------------------\nTrain evaluation --> mse: -5.1516\nTest evaluation --> mse: -5.5774\nTime elapsed: 0.031s\n-------------------------------------------------\nTime: 0.031s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> mse: -200.7336\nTest evaluation --> mse: -1494.3406\nTime elapsed: 0.706s\n-------------------------------------------------\nTime: 0.706s\n\n\nResults for NeuralNetwork:\nFit ---------------------------------------------\nTrain evaluation --> mse: -111.3789\nTest evaluation --> mse: -105.2649\nTime elapsed: 2.372s\n-------------------------------------------------\nTime: 2.372s\n\n\nFinal results ==================== >>\nTotal time: 3.116s\n-------------------------------------\nLasso         --> mse: -5.5774 !\nRandomForest  --> mse: -1494.3406 ~\nNeuralNetwork --> mse: -105.2649\n
                                                                                                                                                                                  In\u00a0[9]: Copied!
                                                                                                                                                                                  # And check which of the models used a meta-estimator wrapper\nfor m in atom.models:\n    print(f\"Estimator for {m} is: {atom[m].estimator}\")\n
                                                                                                                                                                                  # And check which of the models used a meta-estimator wrapper for m in atom.models: print(f\"Estimator for {m} is: {atom[m].estimator}\")
                                                                                                                                                                                  Estimator for Lasso is: MultiOutputRegressor(estimator=Lasso(random_state=1), n_jobs=1)\nEstimator for RF is: RandomForestRegressor(n_jobs=1, random_state=1)\nEstimator for NN is: NeuralNetwork(\n\tmodel=None\n\tbuild_fn=None\n\twarm_start=False\n\trandom_state=1\n\toptimizer=rmsprop\n\tloss=None\n\tmetrics=None\n\tbatch_size=None\n\tvalidation_batch_size=None\n\tverbose=0\n\tcallbacks=None\n\tvalidation_split=0.0\n\tshuffle=True\n\trun_eagerly=False\n\tepochs=100\n\tn_inputs=5\n\tn_outputs=3\n\tname=NN\n\tneeds_scaling=True\n\tnative_multioutput=True\n\tnative_multilabel=False\n\thas_validation=None\n)\n
                                                                                                                                                                                  In\u00a0[10]: Copied!
                                                                                                                                                                                  # Use the target parameter in plots to specify which target column to use\natom.plot_residuals(target=2)\n
                                                                                                                                                                                  # Use the target parameter in plots to specify which target column to use atom.plot_residuals(target=2) In\u00a0[11]: Copied!
                                                                                                                                                                                  with atom.canvas(3, 1, figsize=(900, 1300)):\n    atom.plot_errors(target=0)\n    atom.plot_errors(target=1)\n    atom.plot_errors(target=2)\n
                                                                                                                                                                                  with atom.canvas(3, 1, figsize=(900, 1300)): atom.plot_errors(target=0) atom.plot_errors(target=1) atom.plot_errors(target=2)
                                                                                                                                                                                  \n---------------------------------------------------------------------------\nValueError                                Traceback (most recent call last)\nCell In[11], line 2\n      1 with atom.canvas(3, 1, figsize=(900, 1300)):\n----> 2     atom.plot_errors(target=0)\n      3     atom.plot_errors(target=1)\n      4     atom.plot_errors(target=2)\n\nFile ~\\Documents\\Python\\ATOM\\atom\\utils\\utils.py:2712, in crash.<locals>.wrapper(*args, **kwargs)\n   2709     cache[\"last_exception\"] = ex\n   2710     args[0].logger.exception(\"Exception encountered:\")\n-> 2712 raise ex\n\nFile ~\\Documents\\Python\\ATOM\\atom\\utils\\utils.py:2704, in crash.<locals>.wrapper(*args, **kwargs)\n   2701 @wraps(f)\n   2702 def wrapper(*args, **kwargs) -> Any:\n   2703     try:  # Run the function\n-> 2704         return f(*args, **kwargs)\n   2706     except Exception as ex:\n   2707         # If exception is not the same as last, write to log\n   2708         if ex is not cache[\"last_exception\"] and args[0].logger:\n\nFile ~\\Documents\\Python\\ATOM\\atom\\plots\\predictionplot.py:691, in PredictionPlot.plot_errors(self, models, rows, target, title, legend, figsize, filename, display)\n    689         from atom.models import OrdinaryLeastSquares\n    690         model = OrdinaryLeastSquares(goal=self.task.goal, branches=self._branches)\n--> 691         estimator = model._get_est().fit(bk.DataFrame(y_true), y_pred)\n    693         fig.add_trace(\n    694             self._draw_line(\n    695                 x=(x := np.linspace(y_true.min(), y_true.max(), 100)),\n   (...)\n    703             )\n    704         )\n    706 self._draw_straight_line(y=\"diagonal\", xaxis=xaxis, yaxis=yaxis)\n\nFile ~\\Documents\\Python\\ATOM\\venv310\\lib\\site-packages\\sklearn\\base.py:1152, in 
_fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)\n   1145     estimator._validate_params()\n   1147 with config_context(\n   1148     skip_parameter_validation=(\n   1149         prefer_skip_nested_validation or global_skip_validation\n   1150     )\n   1151 ):\n-> 1152     return fit_method(estimator, *args, **kwargs)\n\nFile ~\\Documents\\Python\\ATOM\\venv310\\lib\\site-packages\\sklearn\\multioutput.py:248, in _MultiOutputEstimator.fit(self, X, y, sample_weight, **fit_params)\n    245     check_classification_targets(y)\n    247 if y.ndim == 1:\n--> 248     raise ValueError(\n    249         \"y must have at least two dimensions for \"\n    250         \"multi-output regression but has only one.\"\n    251     )\n    253 if _routing_enabled():\n    254     routed_params = process_routing(\n    255         obj=self,\n    256         method=\"fit\",\n    257         other_params=fit_params,\n    258         sample_weight=sample_weight,\n    259     )\n\nValueError: y must have at least two dimensions for multi-output regression but has only one.
                                                                                                                                                                                  "}, {"location": "examples/multioutput_regression/#example-multioutput-regression", "title": "Example: Multioutput regression\u00b6", "text": "

                                                                                                                                                                                  This example shows how to use ATOM to make preditions on a multioutput regression dataset. One of the models used is a MLP regressor implemented with Keras using scikeras.

                                                                                                                                                                                  The data used is a synthetic dataset created using sklearn's make_regression function.

                                                                                                                                                                                  "}, {"location": "examples/multioutput_regression/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/multioutput_regression/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/multioutput_regression/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/nlp/", "title": "NLP", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                  import numpy as np\nfrom atom import ATOMClassifier\nfrom sklearn.datasets import fetch_20newsgroups\n
                                                                                                                                                                                  import numpy as np from atom import ATOMClassifier from sklearn.datasets import fetch_20newsgroups In\u00a0[2]: Copied!
                                                                                                                                                                                  # Use only a subset of the available topics for faster processing\nX_text, y_text = fetch_20newsgroups(\n    return_X_y=True,\n    categories=[\n        'sci.med',\n        'comp.windows.x',\n        'misc.forsale',\n        'rec.autos',\n    ],\n    shuffle=True,\n    random_state=1,\n)\nX_text = np.array(X_text).reshape(-1, 1)\n
                                                                                                                                                                                  # Use only a subset of the available topics for faster processing X_text, y_text = fetch_20newsgroups( return_X_y=True, categories=[ 'sci.med', 'comp.windows.x', 'misc.forsale', 'rec.autos', ], shuffle=True, random_state=1, ) X_text = np.array(X_text).reshape(-1, 1) In\u00a0[3]: Copied!
                                                                                                                                                                                  atom = ATOMClassifier(X_text, y_text, index=True, test_size=0.3, verbose=2, random_state=1)\n
                                                                                                                                                                                  atom = ATOMClassifier(X_text, y_text, index=True, test_size=0.3, verbose=2, random_state=1)
                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Multiclass classification.\n\nDataset stats ==================== >>\nShape: (2366, 2)\nTrain set size: 1657\nTest set size: 709\n-------------------------------------\nMemory: 122.87 kB\nScaled: False\nCategorical features: 1 (100.0%)\n\n
                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                  atom.dataset  # Note that the feature is automatically named 'corpus'\n
                                                                                                                                                                                  atom.dataset # Note that the feature is automatically named 'corpus' Out[4]: corpus target 1731 From: rlm@helen.surfcty.com (Robert L. McMilli... 0 1496 From: carl@SOL1.GPS.CALTECH.EDU (Carl J Lydick... 3 1290 From: thssjxy@iitmax.iit.edu (Smile)\\nSubject:... 1 2021 From: c23st@kocrsv01.delcoelect.com (Spiros Tr... 2 142 From: ginkgo@ecsvax.uncecs.edu (J. Geary Morto... 1 ... ... ... 510 From: mary@uicsl.csl.uiuc.edu (Mary E. Allison... 3 1948 From: ndd@sunbar.mc.duke.edu (Ned Danieley)\\nS... 0 798 From: kk@unisql.UUCP (Kerry Kimbrough)\\nSubjec... 0 2222 From: hamachi@adobe.com (Gordon Hamachi)\\nSubj... 2 2215 From: mobasser@vu-vlsi.ee.vill.edu (Bijan Moba... 2

                                                                                                                                                                                  2366 rows \u00d7 2 columns

                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                  # Let's have a look at the first document\natom.corpus[0]\n
                                                                                                                                                                                  # Let's have a look at the first document atom.corpus[0] Out[5]:
                                                                                                                                                                                  'From: caf@omen.UUCP (Chuck Forsberg WA7KGX)\\nSubject: Re: My New Diet --> IT WORKS GREAT !!!!\\nOrganization: Omen Technology INC, Portland Rain Forest\\nLines: 32\\n\\nIn article <1qk6v3INNrm6@lynx.unm.edu> bhjelle@carina.unm.edu () writes:\\n>\\n>Gordon Banks:\\n>\\n>>a lot to keep from going back to morbid obesity.  I think all\\n>>of us cycle.  One\\'s success depends on how large the fluctuations\\n>>in the cycle are.  Some people can cycle only 5 pounds.  Unfortunately,\\n>>I\\'m not one of them.\\n>>\\n>>\\n>This certainly describes my situation perfectly. For me there is\\n>a constant dynamic between my tendency to eat, which appears to\\n>be totally limitless, and the purely conscious desire to not\\n>put on too much weight. When I get too fat, I just diet/exercise\\n>more (with varying degrees of success) to take off the\\n>extra weight. Usually I cycle within a 15 lb range, but\\n>smaller and larger cycles occur as well. I\\'m always afraid\\n>that this method will stop working someday, but usually\\n>I seem to be able to hold the weight gain in check.\\n>This is one reason I have a hard time accepting the notion\\n>of some metabolic derangement associated with cycle dieting\\n>(that results in long-term weight gain). I have been cycle-\\n>dieting for at least 20 years without seeing such a change.\\n\\nAs mentioned in Adiposity 101, only some experience weight\\nrebound.  The fact that you don\\'t doesn\\'t prove it doesn\\'t\\nhappen to others.\\n-- \\nChuck Forsberg WA7KGX          ...!tektronix!reed!omen!caf \\nAuthor of YMODEM, ZMODEM, Professional-YAM, ZCOMM, and DSZ\\n  Omen Technology Inc    \"The High Reliability Software\"\\n17505-V NW Sauvie IS RD   Portland OR 97231   503-621-3406\\n'
                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                  # Clean the documents from noise (emails, numbers, etc...)\natom.textclean()\n
                                                                                                                                                                                  # Clean the documents from noise (emails, numbers, etc...) atom.textclean()
                                                                                                                                                                                  Fitting TextCleaner...\nCleaning the corpus...\n --> Decoding unicode characters to ascii.\n --> Converting text to lower case.\n --> Dropping emails from documents.\n --> Dropping URL links from documents.\n --> Dropping HTML tags from documents.\n --> Dropping emojis from documents.\n --> Dropping numbers from documents.\n --> Dropping punctuation from the text.\n
                                                                                                                                                                                  In\u00a0[7]: Copied!
                                                                                                                                                                                  # Check how the first document changed\natom.corpus[0]\n
                                                                                                                                                                                  # Check how the first document changed atom.corpus[0] Out[7]:
                                                                                                                                                                                  'from  chuck forsberg wa7kgx\\nsubject re my new diet  it works great \\norganization omen technology inc portland rain forest\\nlines \\n\\nin article    writes\\n\\ngordon banks\\n\\na lot to keep from going back to morbid obesity  i think all\\nof us cycle  ones success depends on how large the fluctuations\\nin the cycle are  some people can cycle only  pounds  unfortunately\\nim not one of them\\n\\n\\nthis certainly describes my situation perfectly for me there is\\na constant dynamic between my tendency to eat which appears to\\nbe totally limitless and the purely conscious desire to not\\nput on too much weight when i get too fat i just dietexercise\\nmore with varying degrees of success to take off the\\nextra weight usually i cycle within a  lb range but\\nsmaller and larger cycles occur as well im always afraid\\nthat this method will stop working someday but usually\\ni seem to be able to hold the weight gain in check\\nthis is one reason i have a hard time accepting the notion\\nof some metabolic derangement associated with cycle dieting\\nthat results in longterm weight gain i have been cycle\\ndieting for at least  years without seeing such a change\\n\\nas mentioned in adiposity  only some experience weight\\nrebound  the fact that you dont doesnt prove it doesnt\\nhappen to others\\n \\nchuck forsberg wa7kgx          tektronixreedomencaf \\nauthor of ymodem zmodem professionalyam zcomm and dsz\\n  omen technology inc    the high reliability software\\nv nw sauvie is rd   portland or    \\n'
                                                                                                                                                                                  In\u00a0[8]: Copied!
                                                                                                                                                                                  # Convert the strings to a sequence of words\natom.tokenize()\n
                                                                                                                                                                                  # Convert the strings to a sequence of words atom.tokenize()
                                                                                                                                                                                  Fitting Tokenizer...\nTokenizing the corpus...\n
                                                                                                                                                                                  In\u00a0[9]: Copied!
                                                                                                                                                                                  # Print the first few words of the first document\natom.corpus[0][:7]\n
                                                                                                                                                                                  # Print the first few words of the first document atom.corpus[0][:7] Out[9]:
                                                                                                                                                                                  ['from', 'chuck', 'forsberg', 'wa7kgx', 'subject', 're', 'my']
                                                                                                                                                                                  In\u00a0[10]: Copied!
                                                                                                                                                                                  # Normalize the text to a predefined standard\natom.textnormalize(stopwords=\"english\", lemmatize=True)\n
                                                                                                                                                                                  # Normalize the text to a predefined standard atom.textnormalize(stopwords=\"english\", lemmatize=True)
                                                                                                                                                                                  Fitting TextNormalizer...\nNormalizing the corpus...\n --> Dropping stopwords.\n --> Applying lemmatization.\n
                                                                                                                                                                                  In\u00a0[11]: Copied!
                                                                                                                                                                                  atom.corpus[0][:7]  # Check changes...\n
                                                                                                                                                                                  atom.corpus[0][:7] # Check changes... Out[11]:
                                                                                                                                                                                  ['chuck', 'forsberg', 'wa7kgx', 'subject', 'new', 'diet', 'work']
                                                                                                                                                                                  In\u00a0[12]: Copied!
                                                                                                                                                                                  # Visualize the most common words with a wordcloud\natom.plot_wordcloud(figsize=(700, 500))\n
                                                                                                                                                                                  # Visualize the most common words with a wordcloud atom.plot_wordcloud(figsize=(700, 500)) In\u00a0[13]: Copied!
                                                                                                                                                                                  # Have a look at the most frequent bigrams\natom.plot_ngrams(2)\n
                                                                                                                                                                                  # Have a look at the most frequent bigrams atom.plot_ngrams(2) In\u00a0[14]: Copied!
                                                                                                                                                                                  # Create the bigrams using the tokenizer\natom.tokenize(bigram_freq=215)\n
                                                                                                                                                                                  # Create the bigrams using the tokenizer atom.tokenize(bigram_freq=215)
                                                                                                                                                                                  Fitting Tokenizer...\nTokenizing the corpus...\n --> Creating 7 bigrams on 3128 locations.\n
                                                                                                                                                                                  In\u00a0[15]: Copied!
                                                                                                                                                                                  atom.bigrams_\n
                                                                                                                                                                                  atom.bigrams_ Out[15]: bigram frequency 0 x_x 1168 1 line_article 532 2 line_nntppostinghost 389 3 organization_university 331 4 gordon_bank 266 5 distribution_usa 227 6 line_distribution 215 In\u00a0[16]: Copied!
                                                                                                                                                                                  # As a last step before modelling, convert the words to vectors\natom.vectorize(strategy=\"tfidf\")\n
                                                                                                                                                                                  # As a last step before modelling, convert the words to vectors atom.vectorize(strategy=\"tfidf\")
                                                                                                                                                                                  Fitting Vectorizer...\nVectorizing the corpus...\n
                                                                                                                                                                                  In\u00a0[17]: Copied!
                                                                                                                                                                                  # The dimensionality of the dataset has increased a lot!\natom.shape\n
                                                                                                                                                                                  # The dimensionality of the dataset has increased a lot! atom.shape Out[17]:
                                                                                                                                                                                  (2366, 24176)
                                                                                                                                                                                  In\u00a0[18]: Copied!
                                                                                                                                                                                  # Note that the data is sparse and the columns are named\n# after the words they are embedding\natom.dtypes\n
                                                                                                                                                                                  # Note that the data is sparse and the columns are named # after the words they are embedding atom.dtypes Out[18]:
                                                                                                                                                                                  corpus_000000e5    Sparse[float64, 0]\ncorpus_00000ee5    Sparse[float64, 0]\ncorpus_000010af    Sparse[float64, 0]\ncorpus_0007259d    Sparse[float64, 0]\ncorpus_00072a27    Sparse[float64, 0]\n                          ...        \ncorpus_zurich      Sparse[float64, 0]\ncorpus_zvi         Sparse[float64, 0]\ncorpus_zx          Sparse[float64, 0]\ncorpus_zz          Sparse[float64, 0]\ntarget                          int64\nLength: 24176, dtype: object
                                                                                                                                                                                  In\u00a0[19]: Copied!
                                                                                                                                                                                  # When the dataset is sparse, stats() shows the density\natom.stats()\n
                                                                                                                                                                                  # When the dataset is sparse, stats() shows the density atom.stats()
                                                                                                                                                                                  Dataset stats ==================== >>\nShape: (2366, 24176)\nTrain set size: 1657\nTest set size: 709\n-------------------------------------\nMemory: 2.54 MB\nSparse: True\nDensity: 0.35%\n
                                                                                                                                                                                  In\u00a0[20]: Copied!
                                                                                                                                                                                  # Check which models have support for sparse matrices\natom.available_models()[[\"acronym\", \"model\", \"accepts_sparse\"]]\n
                                                                                                                                                                                  # Check which models have support for sparse matrices atom.available_models()[[\"acronym\", \"model\", \"accepts_sparse\"]] Out[20]: acronym model accepts_sparse 0 AdaB AdaBoost True 1 Bag Bagging True 2 BNB BernoulliNB True 3 CatB CatBoost True 4 CatNB CategoricalNB True 5 CNB ComplementNB True 6 Tree DecisionTree True 7 Dummy Dummy False 8 ETree ExtraTree True 9 ET ExtraTrees True 10 GNB GaussianNB False 11 GP GaussianProcess False 12 GBM GradientBoostingMachine True 13 hGBM HistGradientBoosting False 14 KNN KNearestNeighbors True 15 LGB LightGBM True 16 LDA LinearDiscriminantAnalysis False 17 lSVM LinearSVM True 18 LR LogisticRegression True 19 MLP MultiLayerPerceptron True 20 MNB MultinomialNB True 21 PA PassiveAggressive True 22 Perc Perceptron False 23 QDA QuadraticDiscriminantAnalysis False 24 RNN RadiusNearestNeighbors True 25 RF RandomForest True 26 Ridge Ridge True 27 SGD StochasticGradientDescent True 28 SVM SupportVectorMachine True 29 XGB XGBoost True In\u00a0[21]: Copied!
                                                                                                                                                                                  # Train the model\natom.run(models=\"RF\", metric=\"f1_weighted\")\n
                                                                                                                                                                                  # Train the model atom.run(models=\"RF\", metric=\"f1_weighted\")
                                                                                                                                                                                  \nTraining ========================= >>\nModels: RF\nMetric: f1_weighted\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> f1_weighted: 1.0\nTest evaluation --> f1_weighted: 0.9181\nTime elapsed: 02m:24s\n-------------------------------------------------\nTime: 02m:24s\n\n\nFinal results ==================== >>\nTotal time: 02m:24s\n-------------------------------------\nRandomForest --> f1_weighted: 0.9181\n
                                                                                                                                                                                  In\u00a0[22]: Copied!
                                                                                                                                                                                  atom.evaluate()\n
                                                                                                                                                                                  atom.evaluate() Out[22]: ba f1_weighted jaccard_weighted mcc precision_weighted recall_weighted RF 0.9183 0.9181 0.8486 0.8918 0.9206 0.9182 In\u00a0[23]: Copied!
                                                                                                                                                                                  atom.plot_confusion_matrix(figsize=(700, 600))\n
                                                                                                                                                                                  atom.plot_confusion_matrix(figsize=(700, 600)) In\u00a0[24]: Copied!
                                                                                                                                                                                  atom.plot_shap_decision(rows=0, show=15)\n
                                                                                                                                                                                  atom.plot_shap_decision(rows=0, show=15) In\u00a0[25]: Copied!
                                                                                                                                                                                  atom.plot_shap_beeswarm(target=0, show=15)\n
                                                                                                                                                                                  atom.plot_shap_beeswarm(target=0, show=15)
                                                                                                                                                                                  100%|===================| 2827/2836 [02:38<00:00]        
                                                                                                                                                                                  "}, {"location": "examples/nlp/#example-nlp", "title": "Example: NLP\u00b6", "text": "

                                                                                                                                                                                  This example shows how to use ATOM to quickly go from raw text data to model predictions.

                                                                                                                                                                                  Import the 20 newsgroups text dataset from sklearn.datasets. The dataset comprises around 18000 articles on 20 topics. The goal is to predict the topic of every article.

                                                                                                                                                                                  "}, {"location": "examples/nlp/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/nlp/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/nlp/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/pruning/", "title": "Pruning", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                  # Import packages\nfrom sklearn.datasets import load_breast_cancer\nfrom optuna.pruners import HyperbandPruner\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                  # Import packages from sklearn.datasets import load_breast_cancer from optuna.pruners import HyperbandPruner from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                  # Load the data\nX, y = load_breast_cancer(return_X_y=True)\n
                                                                                                                                                                                  # Load the data X, y = load_breast_cancer(return_X_y=True) In\u00a0[3]: Copied!
                                                                                                                                                                                  # Initialize atom\natom = ATOMClassifier(X, y, verbose=2, random_state=1)\n
                                                                                                                                                                                  # Initialize atom atom = ATOMClassifier(X, y, verbose=2, random_state=1)
                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 141.24 kB\nScaled: False\nOutlier values: 167 (1.2%)\n\n
                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                  # Use ht_params to specify a custom pruner\n# Note that pruned trials show the number of iterations it completed\natom.run(\n    models=\"SGD\",\n    metric=\"f1\",\n    n_trials=25,\n    ht_params={\n        \"distributions\": [\"penalty\", \"max_iter\"],\n        \"pruner\": HyperbandPruner(),\n    }\n)\n
                                                                                                                                                                                  # Use ht_params to specify a custom pruner # Note that pruned trials show the number of iterations it completed atom.run( models=\"SGD\", metric=\"f1\", n_trials=25, ht_params={ \"distributions\": [\"penalty\", \"max_iter\"], \"pruner\": HyperbandPruner(), } )
                                                                                                                                                                                  \nTraining ========================= >>\nModels: SGD\nMetric: f1\n\n\nRunning hyperparameter tuning for StochasticGradientDescent...\n| trial | penalty | max_iter |      f1 | best_f1 | time_trial | time_ht |    state |\n| ----- | ------- | -------- | ------- | ------- | ---------- | ------- | -------- |\n| 0     |      l1 |      650 |  0.9558 |  0.9558 |     2.801s |  2.801s | COMPLETE |\n| 1     | elast.. |     1050 |  0.9744 |  0.9744 |     4.590s |  7.390s | COMPLETE |\n| 2     | elast.. |      500 |  0.9828 |  0.9828 |     0.033s |  7.423s |   PRUNED |\n| 3     |    None |      700 |  0.9739 |  0.9828 |     2.951s | 10.374s | COMPLETE |\n| 4     |      l1 |     1400 |  0.9735 |  0.9828 |     0.033s | 10.407s |   PRUNED |\n| 5     |    None |     1400 |  0.9735 |  0.9828 |     5.994s | 16.401s | COMPLETE |\n| 6     |      l2 |     1200 |  0.9825 |  0.9828 |     5.246s | 21.647s | COMPLETE |\n| 7     |      l2 |     1250 |  0.9825 |  0.9828 |     5.436s | 27.083s | COMPLETE |\n| 8     |    None |      600 |  0.9828 |  0.9828 |     0.023s | 27.106s |   PRUNED |\n| 9     |      l1 |      600 |  0.9402 |  0.9828 |     0.030s | 27.136s |   PRUNED |\n| 10    |      l2 |      950 |  0.9565 |  0.9828 |     4.118s | 31.254s | COMPLETE |\n| 11    |      l2 |     1200 |  0.9825 |  0.9828 |     0.005s | 31.259s | COMPLETE |\n| 12    |      l2 |     1200 |  0.9825 |  0.9828 |     0.005s | 31.264s | COMPLETE |\n| 13    |      l2 |     1200 |  0.9825 |  0.9828 |     0.005s | 31.269s | COMPLETE |\n| 14    |      l2 |     1500 |  0.9573 |  0.9828 |     0.038s | 31.306s |   PRUNED |\n| 15    |      l2 |      950 |  0.9565 |  0.9828 |     0.005s | 31.311s | COMPLETE |\n| 16    |      l2 |     1100 |  0.9391 |  0.9828 |     0.040s | 31.351s |   PRUNED |\n| 17    |      l2 |      850 |  0.9831 |  0.9831 |    
 0.030s | 31.381s |   PRUNED |\n| 18    | elast.. |     1300 |   0.931 |  0.9831 |     0.029s | 31.410s |   PRUNED |\n| 19    |      l2 |     1300 |  0.9649 |  0.9831 |     0.067s | 31.478s |   PRUNED |\n| 20    |      l2 |      800 |  0.9661 |  0.9831 |     0.039s | 31.517s |   PRUNED |\n| 21    |      l2 |     1150 |  0.9402 |  0.9831 |     0.032s | 31.548s |   PRUNED |\n| 22    |      l2 |     1300 |  0.9573 |  0.9831 |     0.038s | 31.586s |   PRUNED |\n| 23    |      l2 |     1250 |  0.9825 |  0.9831 |     0.008s | 31.594s | COMPLETE |\n| 24    |      l2 |     1050 |  0.9565 |  0.9831 |     0.070s | 31.665s |   PRUNED |\nHyperparameter tuning ---------------------------\nBest trial --> 6\nBest parameters:\n --> penalty: l2\n --> max_iter: 1200\nBest evaluation --> f1: 0.9825\nTime elapsed: 31.665s\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.993\nTest evaluation --> f1: 0.9722\nTime elapsed: 8.384s\n-------------------------------------------------\nTime: 40.049s\n\n\nFinal results ==================== >>\nTotal time: 40.301s\n-------------------------------------\nStochasticGradientDescent --> f1: 0.9722\n
                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                  atom.plot_trials()\n
                                                                                                                                                                                  atom.plot_trials() In\u00a0[6]: Copied!
                                                                                                                                                                                  atom.plot_hyperparameter_importance()\n
                                                                                                                                                                                  atom.plot_hyperparameter_importance()"}, {"location": "examples/pruning/#example-pruning", "title": "Example: Pruning\u00b6", "text": "

                                                                                                                                                                                  This example shows an advanced example on how to use hyperparameter tuning with pruning.

                                                                                                                                                                                  Import the breast cancer dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.

                                                                                                                                                                                  "}, {"location": "examples/pruning/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/pruning/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/pruning/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/ray_backend/", "title": "Ray backend", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                  # Import packages\nimport ray\nimport pandas as pd\nfrom atom import ATOMClassifier\nfrom sklearn.datasets import make_classification\n
                                                                                                                                                                                  # Import packages import ray import pandas as pd from atom import ATOMClassifier from sklearn.datasets import make_classification In\u00a0[2]: Copied!
                                                                                                                                                                                  # Use a small dataset for illustration purposes\nX, y = make_classification(n_samples=10000, n_features=10, random_state=1)\n
                                                                                                                                                                                  # Use a small dataset for illustration purposes X, y = make_classification(n_samples=10000, n_features=10, random_state=1) In\u00a0[3]: Copied!
                                                                                                                                                                                  # Note we already specify the number of cores for parallel execution here\natom = ATOMClassifier(X, y, n_jobs=2, backend=\"ray\", verbose=2, random_state=1)\n
                                                                                                                                                                                  # Note we already specify the number of cores for parallel execution here atom = ATOMClassifier(X, y, n_jobs=2, backend=\"ray\", verbose=2, random_state=1)
                                                                                                                                                                                  2023-11-04 23:01:00,897\tINFO worker.py:1664 -- Started a local Ray instance. View the dashboard at 127.0.0.1:8265 \n
                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\nParallel processing with 2 cores.\nParallelization backend: ray\n\nDataset stats ==================== >>\nShape: (10000, 11)\nTrain set size: 8000\nTest set size: 2000\n-------------------------------------\nMemory: 880.13 kB\nScaled: True\nOutlier values: 211 (0.2%)\n\n
                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                  # The ray backend uses modin instead of pandas as data handler\ntype(atom.dataset)\n
                                                                                                                                                                                  # The ray backend uses modin instead of pandas as data handler type(atom.dataset) Out[4]:
                                                                                                                                                                                  pandas.core.frame.DataFrame
                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                  # Use data cleaning as usual\natom.scale()\n
                                                                                                                                                                                  # Use data cleaning as usual atom.scale()
                                                                                                                                                                                  Fitting Scaler...\nScaling features...\n
                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                  # Using `parallel=True`, we train one model in each node\n# Note that when training in parallel, the verbosity of the models is zero\natom.run(models=[\"PA\", \"SGD\"], est_params={\"max_iter\": 150}, parallel=True)\n
                                                                                                                                                                                  # Using `parallel=True`, we train one model in each node # Note that when training in parallel, the verbosity of the models is zero atom.run(models=[\"PA\", \"SGD\"], est_params={\"max_iter\": 150}, parallel=True)
                                                                                                                                                                                  \nTraining ========================= >>\nModels: PA, SGD\nMetric: f1\n\n\nFinal results ==================== >>\nTotal time: 9.407s\n-------------------------------------\nPassiveAggressive         --> f1: 0.8165\nStochasticGradientDescent --> f1: 0.8774 !\n
                                                                                                                                                                                  In\u00a0[7]: Copied!
                                                                                                                                                                                  # Notice how the summed time to train the models is less than the total time\natom.plot_results(metric=\"time_fit\")\n
                                                                                                                                                                                  # Notice how the summed time to train the models is less than the total time atom.plot_results(metric=\"time_fit\") In\u00a0[8]: Copied!
                                                                                                                                                                                  # Create a rest API endpoint and do inference on the holdout set\natom.pa.serve(port=8001)\n
                                                                                                                                                                                  # Create a rest API endpoint and do inference on the holdout set atom.pa.serve(port=8001) In\u00a0[9]: Copied!
                                                                                                                                                                                  import requests\n\nX_predict = atom.X_test.iloc[:10, :]\nresponse = requests.get(\"http://127.0.0.1:8001/\", json=X_predict.to_json())\n
                                                                                                                                                                                  import requests X_predict = atom.X_test.iloc[:10, :] response = requests.get(\"http://127.0.0.1:8001/\", json=X_predict.to_json()) In\u00a0[10]: Copied!
                                                                                                                                                                                  response.json()\n
                                                                                                                                                                                  response.json() Out[10]:
                                                                                                                                                                                  [1, 1, 0, 0, 1, 1, 0, 1, 0, 0]
                                                                                                                                                                                  In\u00a0[11]: Copied!
                                                                                                                                                                                  # Don't forget to shut down the ray server\nray.shutdown()\n
                                                                                                                                                                                  # Don't forget to shut down the ray server ray.shutdown()"}, {"location": "examples/ray_backend/#example-ray-backend", "title": "Example: Ray backend\u00b6", "text": "

                                                                                                                                                                                  This example shows how to use the ray backend to train models in a parallel context.

                                                                                                                                                                                  The data used is a synthetic dataset created using sklearn's make_classification function.

                                                                                                                                                                                  "}, {"location": "examples/ray_backend/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/ray_backend/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/ray_backend/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/regression/", "title": "Regression", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                  # Import packages\nimport pandas as pd\nfrom atom import ATOMRegressor\n
                                                                                                                                                                                  # Import packages import pandas as pd from atom import ATOMRegressor In\u00a0[2]: Copied!
                                                                                                                                                                                  # Load the data\nX = pd.read_csv(\"docs_source/examples/datasets/abalone.csv\")\n\n# Let's have a look\nX.head()\n
                                                                                                                                                                                  # Load the data X = pd.read_csv(\"docs_source/examples/datasets/abalone.csv\") # Let's have a look X.head() Out[2]: Sex Length Diameter Height Whole weight Shucked weight Viscera weight Shell weight Rings 0 M 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 15 1 M 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 7 2 F 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 9 3 M 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 10 4 I 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 7 In\u00a0[3]: Copied!
                                                                                                                                                                                  # Initialize atom for regression tasks\natom = ATOMRegressor(X, \"Rings\", verbose=2, random_state=42)\n
                                                                                                                                                                                  # Initialize atom for regression tasks atom = ATOMRegressor(X, \"Rings\", verbose=2, random_state=42)
                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Regression.\n\nDataset stats ==================== >>\nShape: (4177, 9)\nTrain set size: 3342\nTest set size: 835\n-------------------------------------\nMemory: 300.88 kB\nScaled: False\nCategorical features: 1 (12.5%)\nOutlier values: 195 (0.6%)\n\n
                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                  # Encode the categorical features\natom.encode()\n
                                                                                                                                                                                  # Encode the categorical features atom.encode()
                                                                                                                                                                                  Fitting Encoder...\nEncoding categorical columns...\n --> OneHot-encoding feature Sex. Contains 3 classes.\n
                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                  # Plot the dataset's correlation matrix\natom.plot_correlation()\n
                                                                                                                                                                                  # Plot the dataset's correlation matrix atom.plot_correlation() In\u00a0[6]: Copied!
                                                                                                                                                                                  # Apply pca for dimensionality reduction\natom.feature_selection(strategy=\"pca\", n_features=6)\n
                                                                                                                                                                                  # Apply pca for dimensionality reduction atom.feature_selection(strategy=\"pca\", n_features=6)
                                                                                                                                                                                  Fitting FeatureSelector...\nPerforming feature selection ...\n --> Applying Principal Component Analysis...\n   --> Scaling features...\n   --> Keeping 6 components.\n   --> Explained variance ratio: 0.97\n
                                                                                                                                                                                  In\u00a0[7]: Copied!
                                                                                                                                                                                  # Note that the features are automatically renamed to pca0, pca1, etc...\natom.columns\n
                                                                                                                                                                                  # Note that the features are automatically renamed to pca0, pca1, etc... atom.columns Out[7]:
                                                                                                                                                                                  Index(['pca0', 'pca1', 'pca2', 'pca3', 'pca4', 'pca5', 'Rings'], dtype='object')
                                                                                                                                                                                  In\u00a0[8]: Copied!
                                                                                                                                                                                  # Use the plotting methods to see the retained variance ratio\natom.plot_pca()\n
                                                                                                                                                                                  # Use the plotting methods to see the retained variance ratio atom.plot_pca() In\u00a0[9]: Copied!
                                                                                                                                                                                  atom.plot_components()\n
                                                                                                                                                                                  atom.plot_components() In\u00a0[10]: Copied!
                                                                                                                                                                                  atom.run(\n    models=[\"Tree\", \"Bag\", \"ET\"],\n    metric=\"mse\",\n    n_trials=5,\n    n_bootstrap=5,\n)\n
                                                                                                                                                                                  atom.run( models=[\"Tree\", \"Bag\", \"ET\"], metric=\"mse\", n_trials=5, n_bootstrap=5, )
                                                                                                                                                                                  \nTraining ========================= >>\nModels: Tree, Bag, ET\nMetric: mse\n\n\nRunning hyperparameter tuning for DecisionTree...\n| trial |   criterion | splitter | max_depth | min_samples_split | min_samples_leaf | max_features | ccp_alpha |     mse | best_mse | time_trial | time_ht |    state |\n| ----- | ----------- | -------- | --------- | ----------------- | ---------------- | ------------ | --------- | ------- | -------- | ---------- | ------- | -------- |\n| 0     | absolute_.. |     best |         5 |                 8 |               10 |         None |     0.035 | -6.5456 |  -6.5456 |     0.255s |  0.255s | COMPLETE |\n| 1     | squared_e.. |     best |        10 |                 5 |                1 |          0.5 |      0.03 | -7.1959 |  -6.5456 |     0.065s |  0.320s | COMPLETE |\n| 2     | absolute_.. |   random |        14 |                15 |               16 |         sqrt |     0.025 | -8.5859 |  -6.5456 |     0.067s |  0.387s | COMPLETE |\n| 3     | friedman_.. 
|   random |         4 |                10 |               17 |          0.9 |      0.01 | -7.4933 |  -6.5456 |     0.052s |  0.439s | COMPLETE |\n| 4     |     poisson |     best |        12 |                15 |                8 |          0.6 |      0.02 | -5.8126 |  -5.8126 |     0.066s |  0.505s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 4\nBest parameters:\n --> criterion: poisson\n --> splitter: best\n --> max_depth: 12\n --> min_samples_split: 15\n --> min_samples_leaf: 8\n --> max_features: 0.6\n --> ccp_alpha: 0.02\nBest evaluation --> mse: -5.8126\nTime elapsed: 0.505s\nFit ---------------------------------------------\nTrain evaluation --> mse: -6.2977\nTest evaluation --> mse: -7.1923\nTime elapsed: 0.045s\nBootstrap ---------------------------------------\nEvaluation --> mse: -7.6026 \u00b1 0.3783\nTime elapsed: 0.110s\n-------------------------------------------------\nTime: 0.660s\n\n\nRunning hyperparameter tuning for Bagging...\n| trial | n_estimators | max_samples | max_features | bootstrap | bootstrap_features |     mse | best_mse | time_trial | time_ht |    state |\n| ----- | ------------ | ----------- | ------------ | --------- | ------------------ | ------- | -------- | ---------- | ------- | -------- |\n| 0     |          190 |         1.0 |          0.9 |      True |               True | -4.5751 |  -4.5751 |     5.791s |  5.791s | COMPLETE |\n\nException encountered while running the Bag model.\nMemoryError: could not allocate 187712 bytes\n\n\nRunning hyperparameter tuning for ExtraTrees...\n| trial | n_estimators |     criterion | max_depth | min_samples_split | min_samples_leaf | max_features | bootstrap | max_samples | ccp_alpha |     mse | best_mse | time_trial | time_ht |    state |\n| ----- | ------------ | ------------- | --------- | ----------------- | ---------------- | ------------ | --------- | ----------- | --------- | ------- | -------- | ---------- | ------- | -------- |\n| 0     |         
 190 | squared_error |         8 |                13 |                3 |          0.5 |      True |         0.6 |     0.025 | -5.1462 |  -5.1462 |     0.285s |  0.285s | COMPLETE |\n| 1     |          230 | absolute_er.. |         8 |                 8 |                8 |         sqrt |      True |         0.6 |       0.0 | -9.3444 |  -5.1462 |     1.377s |  1.662s | COMPLETE |\n| 2     |          180 | absolute_er.. |         7 |                 2 |                3 |          0.6 |      True |         0.6 |      0.03 | -5.7371 |  -5.1462 |     1.738s |  3.400s | COMPLETE |\n| 3     |          100 | squared_error |        14 |                15 |                8 |         None |      True |         0.9 |     0.005 | -5.1938 |  -5.1462 |     0.231s |  3.631s | COMPLETE |\n| 4     |          340 | squared_error |         6 |                15 |                8 |         None |      True |         0.8 |      0.01 | -4.8716 |  -4.8716 |     0.457s |  4.088s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 4\nBest parameters:\n --> n_estimators: 340\n --> criterion: squared_error\n --> max_depth: 6\n --> min_samples_split: 15\n --> min_samples_leaf: 8\n --> max_features: None\n --> bootstrap: True\n --> max_samples: 0.8\n --> ccp_alpha: 0.01\nBest evaluation --> mse: -4.8716\nTime elapsed: 4.088s\nFit ---------------------------------------------\nTrain evaluation --> mse: -5.4808\nTest evaluation --> mse: -6.3445\nTime elapsed: 0.535s\nBootstrap ---------------------------------------\nEvaluation --> mse: -6.3694 \u00b1 0.0737\nTime elapsed: 2.245s\n-------------------------------------------------\nTime: 6.868s\n\n\nFinal results ==================== >>\nTotal time: 32.361s\n-------------------------------------\nDecisionTree --> mse: -7.6026 \u00b1 0.3783\nExtraTrees   --> mse: -6.3694 \u00b1 0.0737 !\n
                                                                                                                                                                                  In\u00a0[11]: Copied!
                                                                                                                                                                                  # Use the errors or residuals plots to check the model performances\natom.plot_residuals()\n
                                                                                                                                                                                  # Use the errors or residuals plots to check the model performances atom.plot_residuals() In\u00a0[12]: Copied!
                                                                                                                                                                                  atom.plot_errors()\n
                                                                                                                                                                                  atom.plot_errors() In\u00a0[13]: Copied!
                                                                                                                                                                                  # Analyze the relation between the target response and the features\natom.plot_partial_dependence(columns=(0, 1, 2, 3))\n
                                                                                                                                                                                  # Analyze the relation between the target response and the features atom.plot_partial_dependence(columns=(0, 1, 2, 3))"}, {"location": "examples/regression/#example-regression", "title": "Example: Regression\u00b6", "text": "

                                                                                                                                                                                  This example shows how to use ATOM to apply pca on the data and run a regression pipeline.

                                                                                                                                                                                  Download the abalone dataset from https://archive.ics.uci.edu/ml/datasets/Abalone. The goal of this dataset is to predict the rings (age) of abalone shells from physical measurements.

                                                                                                                                                                                  "}, {"location": "examples/regression/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/regression/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/regression/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/successive_halving/", "title": "Successive halving", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                  from sklearn.datasets import fetch_california_housing\nfrom atom import ATOMRegressor\n
                                                                                                                                                                                  from sklearn.datasets import fetch_california_housing from atom import ATOMRegressor In\u00a0[2]: Copied!
                                                                                                                                                                                  # Load the data\nX, y = fetch_california_housing(return_X_y=True)\n
                                                                                                                                                                                  # Load the data X, y = fetch_california_housing(return_X_y=True) In\u00a0[3]: Copied!
                                                                                                                                                                                  atom = ATOMRegressor(X, y, verbose=2, random_state=1)\n
                                                                                                                                                                                  atom = ATOMRegressor(X, y, verbose=2, random_state=1)
                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Regression.\n\nDataset stats ==================== >>\nShape: (20640, 9)\nTrain set size: 16512\nTest set size: 4128\n-------------------------------------\nMemory: 1.49 MB\nScaled: False\nOutlier values: 786 (0.5%)\n\n
                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                  # Compare tree-based models via successive halving\natom.successive_halving(\n    models=[\"Tree\", \"Bag\", \"ET\", \"RF\", \"LGB\", \"CatB\"],\n    metric=\"mae\",\n    n_bootstrap=5,\n)\n
                                                                                                                                                                                  # Compare tree-based models via successive halving atom.successive_halving( models=[\"Tree\", \"Bag\", \"ET\", \"RF\", \"LGB\", \"CatB\"], metric=\"mae\", n_bootstrap=5, )
                                                                                                                                                                                  \nTraining ========================= >>\nMetric: mae\n\n\nRun: 0 =========================== >>\nModels: Tree6, Bag6, ET6, RF6, LGB6, CatB6\nSize of training set: 16512 (17%)\nSize of test set: 4128\n\n\nResults for DecisionTree:\nFit ---------------------------------------------\nTrain evaluation --> mae: -0.0\nTest evaluation --> mae: -0.5394\nTime elapsed: 0.103s\nBootstrap ---------------------------------------\nEvaluation --> mae: -0.576 \u00b1 0.0119\nTime elapsed: 0.422s\n-------------------------------------------------\nTime: 0.525s\n\n\nResults for Bagging:\nFit ---------------------------------------------\nTrain evaluation --> mae: -0.1715\nTest evaluation --> mae: -0.4308\nTime elapsed: 0.450s\nBootstrap ---------------------------------------\nEvaluation --> mae: -0.435 \u00b1 0.0059\nTime elapsed: 2.061s\n-------------------------------------------------\nTime: 2.511s\n\n\nResults for ExtraTrees:\nFit ---------------------------------------------\nTrain evaluation --> mae: -0.0\nTest evaluation --> mae: -0.3977\nTime elapsed: 1.574s\nBootstrap ---------------------------------------\nEvaluation --> mae: -0.4059 \u00b1 0.0028\nTime elapsed: 7.107s\n-------------------------------------------------\nTime: 8.681s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> mae: -0.1508\nTest evaluation --> mae: -0.4053\nTime elapsed: 4.178s\nBootstrap ---------------------------------------\nEvaluation --> mae: -0.4162 \u00b1 0.0031\nTime elapsed: 18.156s\n-------------------------------------------------\nTime: 22.335s\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --> mae: -0.2031\nTest evaluation --> mae: -0.3594\nTime elapsed: 0.438s\nBootstrap 
---------------------------------------\nEvaluation --> mae: -0.3673 \u00b1 0.0016\nTime elapsed: 0.886s\n-------------------------------------------------\nTime: 1.324s\n\n\nResults for CatBoost:\nFit ---------------------------------------------\nTrain evaluation --> mae: -0.1621\nTest evaluation --> mae: -0.3483\nTime elapsed: 5.084s\nBootstrap ---------------------------------------\nEvaluation --> mae: -0.3554 \u00b1 0.0025\nTime elapsed: 20.177s\n-------------------------------------------------\nTime: 25.261s\n\n\nFinal results ==================== >>\nTotal time: 01m:01s\n-------------------------------------\nDecisionTree --> mae: -0.576 \u00b1 0.0119 ~\nBagging      --> mae: -0.435 \u00b1 0.0059 ~\nExtraTrees   --> mae: -0.4059 \u00b1 0.0028 ~\nRandomForest --> mae: -0.4162 \u00b1 0.0031 ~\nLightGBM     --> mae: -0.3673 \u00b1 0.0016 ~\nCatBoost     --> mae: -0.3554 \u00b1 0.0025 ~ !\n\n\nRun: 1 =========================== >>\nModels: ET3, LGB3, CatB3\nSize of training set: 16512 (33%)\nSize of test set: 4128\n\n\nResults for ExtraTrees:\nFit ---------------------------------------------\nTrain evaluation --> mae: -0.0\nTest evaluation --> mae: -0.3739\nTime elapsed: 2.738s\nBootstrap ---------------------------------------\nEvaluation --> mae: -0.3841 \u00b1 0.0027\nTime elapsed: 11.259s\n-------------------------------------------------\nTime: 13.997s\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --> mae: -0.2327\nTest evaluation --> mae: -0.3356\nTime elapsed: 0.389s\nBootstrap ---------------------------------------\nEvaluation --> mae: -0.345 \u00b1 0.0037\nTime elapsed: 0.876s\n-------------------------------------------------\nTime: 1.265s\n\n\nResults for CatBoost:\nFit ---------------------------------------------\nTrain evaluation --> mae: -0.1882\nTest evaluation --> mae: -0.3255\nTime elapsed: 4.800s\nBootstrap ---------------------------------------\nEvaluation --> mae: -0.3352 \u00b1 
0.0023\nTime elapsed: 22.708s\n-------------------------------------------------\nTime: 27.509s\n\n\nFinal results ==================== >>\nTotal time: 43.130s\n-------------------------------------\nExtraTrees --> mae: -0.3841 \u00b1 0.0027 ~\nLightGBM   --> mae: -0.345 \u00b1 0.0037 ~\nCatBoost   --> mae: -0.3352 \u00b1 0.0023 ~ !\n\n\nRun: 2 =========================== >>\nModels: CatB1\nSize of training set: 16512 (100%)\nSize of test set: 4128\n\n\nResults for CatBoost:\nFit ---------------------------------------------\nTrain evaluation --> mae: -0.2229\nTest evaluation --> mae: -0.2986\nTime elapsed: 6.851s\nBootstrap ---------------------------------------\nEvaluation --> mae: -0.3091 \u00b1 0.0026\nTime elapsed: 33.428s\n-------------------------------------------------\nTime: 40.279s\n\n\nFinal results ==================== >>\nTotal time: 40.375s\n-------------------------------------\nCatBoost --> mae: -0.3091 \u00b1 0.0026 ~\n
                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                  # The results are now multi-indexed, where frac is the fraction\n# of the training set used to fit the model. The model names\n# end with the number of models fitted during that run\natom.results\n
                                                                                                                                                                                  # The results are now multi-indexed, where frac is the fraction # of the training set used to fit the model. The model names # end with the number of models fitted during that run atom.results Out[5]: mae_train mae_test time_fit mae_bootstrap time_bootstrap time frac model 0.17 Bag6 -0.2017 -0.4327 0.450035 -0.434981 2.061373 2.511408 CatB6 -0.2065 -0.3557 5.083625 -0.355352 20.176994 25.260619 ET6 -0.0694 -0.4077 1.574000 -0.405855 7.106890 8.680890 LGB6 -0.2202 -0.3676 0.438399 -0.367271 0.885806 1.324205 RF6 -0.1851 -0.4165 4.178345 -0.416217 18.156310 22.334655 Tree6 -0.1039 -0.5897 0.102987 -0.575962 0.422224 0.525211 0.33 CatB3 -0.2249 -0.3384 4.800246 -0.335246 22.708465 27.508711 ET3 -0.0935 -0.3879 2.738315 -0.384081 11.258794 13.997109 LGB3 -0.2489 -0.3405 0.389353 -0.344951 0.875797 1.265150 1.00 CatB1 -0.2447 -0.3066 6.851350 -0.309112 33.428059 40.279409 In\u00a0[6]: Copied!
                                                                                                                                                                                  # Plot the successive halving's results\natom.plot_successive_halving()\n
                                                                                                                                                                                  # Plot the successive halving's results atom.plot_successive_halving() In\u00a0[7]: Copied!
                                                                                                                                                                                  # Use regex to call all the models with the same estimator...\natom.plot_errors(models=[\"CatB.*\"])\n
                                                                                                                                                                                  # Use regex to call all the models with the same estimator... atom.plot_errors(models=[\"CatB.*\"]) In\u00a0[8]: Copied!
                                                                                                                                                                                  # ...or to call the models from the same run\natom.plot_errors(models=\".*3\")\n
                                                                                                                                                                                  # ...or to call the models from the same run atom.plot_errors(models=\".*3\")"}, {"location": "examples/successive_halving/#example-successive-halving", "title": "Example: Successive halving\u00b6", "text": "

                                                                                                                                                                                  This example shows how to compare multiple tree-based models using successive halving.

                                                                                                                                                                                  Import the california housing dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict house prices.

                                                                                                                                                                                  "}, {"location": "examples/successive_halving/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/successive_halving/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/successive_halving/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/train_sizing/", "title": "Train sizing", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                  # Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                  # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                  # Load the data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n
                                                                                                                                                                                  # Load the data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\") # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0

                                                                                                                                                                                  5 rows \u00d7 22 columns

                                                                                                                                                                                  In\u00a0[3]: Copied!
                                                                                                                                                                                  # Initialize atom and prepare the data\natom = ATOMClassifier(X, verbose=2, random_state=1)\natom.clean()\natom.impute(strat_num=\"median\", strat_cat=\"most_frequent\", max_nan_rows=0.8)\natom.encode()\n
                                                                                                                                                                                  # Initialize atom and prepare the data atom = ATOMClassifier(X, verbose=2, random_state=1) atom.clean() atom.impute(strat_num=\"median\", strat_cat=\"most_frequent\", max_nan_rows=0.8) atom.encode()
                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 25.03 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n\nFitting Cleaner...\nCleaning the data...\nFitting Imputer...\nImputing missing values...\n --> Dropping 161 samples for containing more than 16 missing values.\n --> Imputing 481 missing values with median (12.0) in feature MinTemp.\n --> Imputing 265 missing values with median (22.6) in feature MaxTemp.\n --> Imputing 1354 missing values with median (0.0) in feature Rainfall.\n --> Imputing 60682 missing values with median (4.8) in feature Evaporation.\n --> Imputing 67659 missing values with median (8.4) in feature Sunshine.\n --> Imputing 9187 missing values with most_frequent (W) in feature WindGustDir.\n --> Imputing 9127 missing values with median (39.0) in feature WindGustSpeed.\n --> Imputing 9852 missing values with most_frequent (N) in feature WindDir9am.\n --> Imputing 3617 missing values with most_frequent (SE) in feature WindDir3pm.\n --> Imputing 1187 missing values with median (13.0) in feature WindSpeed9am.\n --> Imputing 2469 missing values with median (19.0) in feature WindSpeed3pm.\n --> Imputing 1613 missing values with median (70.0) in feature Humidity9am.\n --> Imputing 3449 missing values with median (52.0) in feature Humidity3pm.\n --> Imputing 13863 missing values with median (1017.6) in feature Pressure9am.\n --> Imputing 13830 missing values with median (1015.2) in feature Pressure3pm.\n --> Imputing 53496 missing values with median (5.0) in feature Cloud9am.\n --> 
Imputing 56933 missing values with median (5.0) in feature Cloud3pm.\n --> Imputing 743 missing values with median (16.7) in feature Temp9am.\n --> Imputing 2565 missing values with median (21.1) in feature Temp3pm.\n --> Imputing 1354 missing values with most_frequent (No) in feature RainToday.\nFitting Encoder...\nEncoding categorical columns...\n --> Target-encoding feature Location. Contains 49 classes.\n --> Target-encoding feature WindGustDir. Contains 16 classes.\n --> Target-encoding feature WindDir9am. Contains 16 classes.\n --> Target-encoding feature WindDir3pm. Contains 16 classes.\n --> Ordinal-encoding feature RainToday. Contains 2 classes.\n
                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                  # Analyze the impact of the training set's size on a LR model\natom.train_sizing(\"LR\", train_sizes=10, n_bootstrap=5)\n
                                                                                                                                                                                  # Analyze the impact of the training set's size on a LR model atom.train_sizing(\"LR\", train_sizes=10, n_bootstrap=5)
                                                                                                                                                                                  \nTraining ========================= >>\nMetric: f1\n\n\nRun: 0 =========================== >>\nModels: LR01\nSize of training set: 11362 (10%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.5624\nTest evaluation --> f1: 0.5857\nTime elapsed: 0.721s\nBootstrap ---------------------------------------\nEvaluation --> f1: 0.585 \u00b1 0.0021\nTime elapsed: 0.729s\n-------------------------------------------------\nTime: 1.449s\n\n\nFinal results ==================== >>\nTotal time: 2.053s\n-------------------------------------\nLogisticRegression --> f1: 0.585 \u00b1 0.0021\n\n\nRun: 1 =========================== >>\nModels: LR02\nSize of training set: 22724 (20%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.582\nTest evaluation --> f1: 0.5874\nTime elapsed: 0.853s\nBootstrap ---------------------------------------\nEvaluation --> f1: 0.5851 \u00b1 0.002\nTime elapsed: 0.865s\n-------------------------------------------------\nTime: 1.718s\n\n\nFinal results ==================== >>\nTotal time: 2.425s\n-------------------------------------\nLogisticRegression --> f1: 0.5851 \u00b1 0.002\n\n\nRun: 2 =========================== >>\nModels: LR03\nSize of training set: 34087 (30%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.5812\nTest evaluation --> f1: 0.585\nTime elapsed: 1.086s\nBootstrap ---------------------------------------\nEvaluation --> f1: 0.5861 \u00b1 0.0009\nTime elapsed: 1.119s\n-------------------------------------------------\nTime: 2.205s\n\n\nFinal results ==================== >>\nTotal time: 
3.035s\n-------------------------------------\nLogisticRegression --> f1: 0.5861 \u00b1 0.0009\n\n\nRun: 3 =========================== >>\nModels: LR04\nSize of training set: 45449 (40%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.5828\nTest evaluation --> f1: 0.5862\nTime elapsed: 1.173s\nBootstrap ---------------------------------------\nEvaluation --> f1: 0.5863 \u00b1 0.0017\nTime elapsed: 1.282s\n-------------------------------------------------\nTime: 2.455s\n\n\nFinal results ==================== >>\nTotal time: 3.365s\n-------------------------------------\nLogisticRegression --> f1: 0.5863 \u00b1 0.0017\n\n\nRun: 4 =========================== >>\nModels: LR05\nSize of training set: 56812 (50%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.5823\nTest evaluation --> f1: 0.5853\nTime elapsed: 1.264s\nBootstrap ---------------------------------------\nEvaluation --> f1: 0.585 \u00b1 0.0016\nTime elapsed: 1.460s\n-------------------------------------------------\nTime: 2.724s\n\n\nFinal results ==================== >>\nTotal time: 3.758s\n-------------------------------------\nLogisticRegression --> f1: 0.585 \u00b1 0.0016\n\n\nRun: 5 =========================== >>\nModels: LR06\nSize of training set: 68174 (60%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.5835\nTest evaluation --> f1: 0.5843\nTime elapsed: 1.392s\nBootstrap ---------------------------------------\nEvaluation --> f1: 0.585 \u00b1 0.0016\nTime elapsed: 1.704s\n-------------------------------------------------\nTime: 3.095s\n\n\nFinal results ==================== >>\nTotal time: 4.151s\n-------------------------------------\nLogisticRegression --> f1: 0.585 \u00b1 0.0016\n\n\nRun: 6 =========================== 
>>\nModels: LR07\nSize of training set: 79536 (70%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.5872\nTest evaluation --> f1: 0.5846\nTime elapsed: 1.585s\nBootstrap ---------------------------------------\nEvaluation --> f1: 0.5852 \u00b1 0.0013\nTime elapsed: 1.836s\n-------------------------------------------------\nTime: 3.421s\n\n\nFinal results ==================== >>\nTotal time: 4.664s\n-------------------------------------\nLogisticRegression --> f1: 0.5852 \u00b1 0.0013\n\n\nRun: 7 =========================== >>\nModels: LR08\nSize of training set: 90899 (80%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.5889\nTest evaluation --> f1: 0.5841\nTime elapsed: 1.693s\nBootstrap ---------------------------------------\nEvaluation --> f1: 0.5852 \u00b1 0.0025\nTime elapsed: 2.139s\n-------------------------------------------------\nTime: 3.832s\n\n\nFinal results ==================== >>\nTotal time: 5.157s\n-------------------------------------\nLogisticRegression --> f1: 0.5852 \u00b1 0.0025\n\n\nRun: 8 =========================== >>\nModels: LR09\nSize of training set: 102261 (90%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.5871\nTest evaluation --> f1: 0.5837\nTime elapsed: 1.754s\nBootstrap ---------------------------------------\nEvaluation --> f1: 0.5844 \u00b1 0.0022\nTime elapsed: 2.353s\n-------------------------------------------------\nTime: 4.107s\n\n\nFinal results ==================== >>\nTotal time: 5.464s\n-------------------------------------\nLogisticRegression --> f1: 0.5844 \u00b1 0.0022\n\n\nRun: 9 =========================== >>\nModels: LR10\nSize of training set: 113624 (100%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit 
---------------------------------------------\nTrain evaluation --> f1: 0.5856\nTest evaluation --> f1: 0.585\nTime elapsed: 1.978s\nBootstrap ---------------------------------------\nEvaluation --> f1: 0.5846 \u00b1 0.0005\nTime elapsed: 2.544s\n-------------------------------------------------\nTime: 4.521s\n\n\nFinal results ==================== >>\nTotal time: 5.975s\n-------------------------------------\nLogisticRegression --> f1: 0.5846 \u00b1 0.0005\n
                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                  # The results are now multi-index, where frac is the fraction\n# of the training set used to fit the model. The model names\n# end with the fraction as well (without the dot)\natom.results\n
                                                                                                                                                                                  # The results are now multi-index, where frac is the fraction # of the training set used to fit the model. The model names # end with the fraction as well (without the dot) atom.results Out[5]: f1_train f1_test time_fit f1_bootstrap time_bootstrap time frac model 0.1 LR01 0.5622 0.5852 0.720655 0.585044 0.728664 1.449319 0.2 LR02 0.5830 0.5845 0.852776 0.585144 0.864794 1.717570 0.3 LR03 0.5795 0.5856 1.085709 0.586101 1.119410 2.205119 0.4 LR04 0.5847 0.5858 1.173066 0.586305 1.282166 2.455232 0.5 LR05 0.5836 0.5862 1.264150 0.585003 1.460329 2.724479 0.6 LR06 0.5832 0.5833 1.391943 0.584966 1.703550 3.095493 0.7 LR07 0.5880 0.5856 1.585444 0.585199 1.835532 3.420976 0.8 LR08 0.5914 0.5882 1.693054 0.585235 2.138652 3.831706 0.9 LR09 0.5854 0.5828 1.753595 0.584420 2.353141 4.106736 1.0 LR10 0.5862 0.5850 1.977799 0.584634 2.543574 4.521373 In\u00a0[6]: Copied!
                                                                                                                                                                                  # Every model can be accessed through its name\natom.lr05.plot_shap_waterfall(show=6)\n
                                                                                                                                                                                  # Every model can be accessed through its name atom.lr05.plot_shap_waterfall(show=6) In\u00a0[7]: Copied!
                                                                                                                                                                                  # Plot the train sizing's results\natom.plot_learning_curve()\n
                                                                                                                                                                                  # Plot the train sizing's results atom.plot_learning_curve()"}, {"location": "examples/train_sizing/#example-train-sizing", "title": "Example: Train sizing\u00b6", "text": "

                                                                                                                                                                                  This example shows how to assess a model's performance based on the size of the training set.

                                                                                                                                                                                  The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target RainTomorrow.

                                                                                                                                                                                  "}, {"location": "examples/train_sizing/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/train_sizing/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/train_sizing/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/utilities/", "title": "Utilities", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                  # Import packages\nimport tempfile\nimport pandas as pd\nfrom sklearn.metrics import fbeta_score\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                  # Import packages import tempfile import pandas as pd from sklearn.metrics import fbeta_score from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                  # Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n
                                                                                                                                                                                  # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\") # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0

                                                                                                                                                                                  5 rows \u00d7 22 columns

                                                                                                                                                                                  In\u00a0[3]: Copied!
                                                                                                                                                                                  atom = ATOMClassifier(X, random_state=1)\natom.clean()\n\n# Quickly check what columns have missing values\nprint(f\"Columns with missing values:\\n{atom.nans}\")\n\n# Or what columns are categorical\nprint(f\"\\nCategorical columns: {atom.categorical}\")\n\n# Or if the dataset is scaled\nprint(f\"\\nIs the dataset scaled? {atom.scaled}\")\n
                                                                                                                                                                                  atom = ATOMClassifier(X, random_state=1) atom.clean() # Quickly check what columns have missing values print(f\"Columns with missing values:\\n{atom.nans}\") # Or what columns are categorical print(f\"\\nCategorical columns: {atom.categorical}\") # Or if the dataset is scaled print(f\"\\nIs the dataset scaled? {atom.scaled}\")
                                                                                                                                                                                  Columns with missing values:\nLocation             0\nMinTemp            637\nMaxTemp            322\nRainfall          1406\nEvaporation      60843\nSunshine         67816\nWindGustDir       9330\nWindGustSpeed     9270\nWindDir9am       10013\nWindDir3pm        3778\nWindSpeed9am      1348\nWindSpeed3pm      2630\nHumidity9am       1774\nHumidity3pm       3610\nPressure9am      14014\nPressure3pm      13981\nCloud9am         53657\nCloud3pm         57094\nTemp9am            904\nTemp3pm           2726\nRainToday         1406\ndtype: int64\n\nCategorical columns: Index(['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday'], dtype='object')\n\nIs the dataset scaled? False\n
                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                  # Note the number of missing values and categorical columns\natom.stats()\n
                                                                                                                                                                                  # Note the number of missing values and categorical columns atom.stats()
                                                                                                                                                                                  Dataset stats ==================== >>\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 27.44 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n
                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                  # Now, let's impute and encode the dataset...\natom.impute()\natom.encode()\n\n# ... and the values are gone\natom.stats()\n
                                                                                                                                                                                  # Now, let's impute and encode the dataset... atom.impute() atom.encode() # ... and the values are gone atom.stats()
                                                                                                                                                                                  Dataset stats ==================== >>\nShape: (56420, 22)\nTrain set size: 45075\nTest set size: 11345\n-------------------------------------\nMemory: 11.11 MB\nScaled: False\nOutlier values: 3203 (0.3%)\n
                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                  # Compare the relationship of multiple columns with a scatter matrix\natom.plot_relationships(columns=slice(0, 5))\n
                                                                                                                                                                                  # Compare the relationship of multiple columns with a scatter matrix atom.plot_relationships(columns=slice(0, 5)) In\u00a0[7]: Copied!
                                                                                                                                                                                  # Check which distribution fits a column best\natom.distribution(columns=\"Rainfall\")\n
                                                                                                                                                                                  # Check which distribution fits a column best atom.distribution(columns=\"Rainfall\") Out[7]: Rainfall dist stat beta score 0.6506 p_value 0.0 expon score 0.6506 p_value 0.0 gamma score 0.6465 p_value 0.0 invgauss score 0.6257 p_value 0.0 lognorm score 0.6485 p_value 0.0 norm score 0.3807 p_value 0.0 pearson3 score 0.6506 p_value 0.0 triang score 0.7191 p_value 0.0 uniform score 0.8914 p_value 0.0 weibull_min score 0.6506 p_value 0.0 weibull_max score 0.8896 p_value 0.0 In\u00a0[8]: Copied!
                                                                                                                                                                                  # Investigate a column's distribution\natom.plot_distribution(columns=\"MinTemp\", distributions=\"beta\")\natom.plot_qq(columns=\"MinTemp\", distributions=\"beta\")\n
                                                                                                                                                                                  # Investigate a column's distribution atom.plot_distribution(columns=\"MinTemp\", distributions=\"beta\") atom.plot_qq(columns=\"MinTemp\", distributions=\"beta\")

                                                                                                                                                                                  There are two ways to quickly transform the dataset mid-pipeline. The first way is through the property's @setter. The downside for this approach is that the transformation is not stored in atom's pipeline, so the transformation is not applied on new data. Therefore, we recommend using the second approach, through the add method.

                                                                                                                                                                                  In\u00a0[9]: Copied!
                                                                                                                                                                                  # Note that we can only replace a dataframe with a new dataframe!\natom.X = atom.X.assign(AvgTemp=(atom.X[\"MaxTemp\"] + atom.X[\"MinTemp\"])/2)\n\n# This will automatically update all other data attributes\nassert \"AvgTemp\" in atom\n\n# But it's not saved to atom's pipeline\natom.pipeline\n
                                                                                                                                                                                  # Note that we can only replace a dataframe with a new dataframe! atom.X = atom.X.assign(AvgTemp=(atom.X[\"MaxTemp\"] + atom.X[\"MinTemp\"])/2) # This will automatically update all other data attributes assert \"AvgTemp\" in atom # But it's not saved to atom's pipeline atom.pipeline Out[9]:
                                                                                                                                                                                  Pipeline(memory=Memory(location=None),\n         steps=[('Cleaner', Cleaner()), ('Imputer', Imputer()),\n                ('Encoder', Encoder(value='rare'))])
                                                                                                                                                                                  In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.Pipeline
                                                                                                                                                                                  Pipeline(memory=Memory(location=None),\n         steps=[('Cleaner', Cleaner()), ('Imputer', Imputer()),\n                ('Encoder', Encoder(value='rare'))])
                                                                                                                                                                                  Cleaner
                                                                                                                                                                                  Cleaner()
                                                                                                                                                                                  Imputer
                                                                                                                                                                                  Imputer()
                                                                                                                                                                                  Encoder
                                                                                                                                                                                  Encoder(value='rare')
                                                                                                                                                                                  In\u00a0[10]: Copied!
                                                                                                                                                                                  # Same transformation, different approach (AvgTemp is overwritten)\ndef transform(df):\n    df[\"AvgTemp\"] = (df.MaxTemp + df.MinTemp) / 2\n    return df\n\natom.apply(transform)\n\nassert \"AvgTemp\" in atom\n
                                                                                                                                                                                  # Same transformation, different approach (AvgTemp is overwritten) def transform(df): df[\"AvgTemp\"] = (df.MaxTemp + df.MinTemp) / 2 return df atom.apply(transform) assert \"AvgTemp\" in atom In\u00a0[11]: Copied!
                                                                                                                                                                                  # Now the function appears in the pipeline\natom.pipeline\n
                                                                                                                                                                                  # Now the function appears in the pipeline atom.pipeline Out[11]:
                                                                                                                                                                                  Pipeline(memory=Memory(location=None),\n         steps=[('Cleaner', Cleaner()), ('Imputer', Imputer()),\n                ('Encoder', Encoder(value='rare')),\n                ('FunctionTransformer',\n                 FunctionTransformer(func=<function transform at 0x0000016745DF6B90>))])
                                                                                                                                                                                  In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.Pipeline
                                                                                                                                                                                  Pipeline(memory=Memory(location=None),\n         steps=[('Cleaner', Cleaner()), ('Imputer', Imputer()),\n                ('Encoder', Encoder(value='rare')),\n                ('FunctionTransformer',\n                 FunctionTransformer(func=<function transform at 0x0000016745DF6B90>))])
                                                                                                                                                                                  Cleaner
                                                                                                                                                                                  Cleaner()
                                                                                                                                                                                  Imputer
                                                                                                                                                                                  Imputer()
                                                                                                                                                                                  Encoder
                                                                                                                                                                                  Encoder(value='rare')
                                                                                                                                                                                  FunctionTransformer
                                                                                                                                                                                  FunctionTransformer(func=<function transform at 0x0000016745DF6B90>)
                                                                                                                                                                                  In\u00a0[12]: Copied!
                                                                                                                                                                                  atom.available_models()\n
                                                                                                                                                                                  atom.available_models() Out[12]: acronym model estimator module needs_scaling accepts_sparse native_multilabel native_multioutput has_validation supports_engines 0 AdaB AdaBoost AdaBoostClassifier sklearnensemble False True False False False sklearn 1 Bag Bagging BaggingClassifier sklearnensemble False True False False False sklearn 2 BNB BernoulliNB BernoulliNB sklearnnaive_bayes False True False False False sklearn, cuml 3 CatB CatBoost CatBoostClassifier catboostcatboost True True False False True catboost 4 CatNB CategoricalNB CategoricalNB sklearnnaive_bayes False True False False False sklearn, cuml 5 CNB ComplementNB ComplementNB sklearnnaive_bayes False True False False False sklearn, cuml 6 Tree DecisionTree DecisionTreeClassifier sklearntree False True True True False sklearn 7 Dummy Dummy DummyClassifier sklearndummy False False False False False sklearn 8 ETree ExtraTree ExtraTreeClassifier sklearntree False True True True False sklearn 9 ET ExtraTrees ExtraTreesClassifier sklearnensemble False True True True False sklearn 10 GNB GaussianNB GaussianNB sklearnnaive_bayes False False False False False sklearn, cuml 11 GP GaussianProcess GaussianProcessClassifier sklearngaussian_process False False False False False sklearn 12 GBM GradientBoostingMachine GradientBoostingClassifier sklearnensemble False True False False False sklearn 13 hGBM HistGradientBoosting HistGradientBoostingClassifier sklearnensemble False False False False False sklearn 14 KNN KNearestNeighbors KNeighborsClassifier sklearnneighbors True True True True False sklearn, sklearnex, cuml 15 LGB LightGBM LGBMClassifier lightgbmlightgbm.sklearn True True False False True lightgbm 16 LDA LinearDiscriminantAnalysis LinearDiscriminantAnalysis sklearndiscriminant_analysis False False False False False sklearn 17 lSVM 
LinearSVM LinearSVC sklearnsvm True True False False False sklearn, cuml 18 LR LogisticRegression LogisticRegression sklearnlinear_model True True False False False sklearn, sklearnex, cuml 19 MLP MultiLayerPerceptron MLPClassifier sklearnneural_network True True True False True sklearn 20 MNB MultinomialNB MultinomialNB sklearnnaive_bayes False True False False False sklearn, cuml 21 PA PassiveAggressive PassiveAggressiveClassifier sklearnlinear_model True True False False True sklearn 22 Perc Perceptron Perceptron sklearnlinear_model True False False False True sklearn 23 QDA QuadraticDiscriminantAnalysis QuadraticDiscriminantAnalysis sklearndiscriminant_analysis False False False False False sklearn 24 RNN RadiusNearestNeighbors RadiusNeighborsClassifier sklearnneighbors True True True True False sklearn 25 RF RandomForest RandomForestClassifier sklearnensemble False True True True False sklearn, sklearnex, cuml 26 Ridge Ridge RidgeClassifier sklearnlinear_model True True True False False sklearn, sklearnex, cuml 27 SGD StochasticGradientDescent SGDClassifier sklearnlinear_model True True False False True sklearn 28 SVM SupportVectorMachine SVC sklearnsvm True True False False False sklearn, sklearnex, cuml 29 XGB XGBoost XGBClassifier xgboostxgboost True True False False True xgboost In\u00a0[13]: Copied!
                                                                                                                                                                                  atom.verbose = 1\n\n# Define a custom metric\ndef f2(y_true, y_pred):\n    return fbeta_score(y_true, y_pred, beta=2)\n\n# Use the greater_is_better, needs_proba and needs_threshold parameters if necessary\natom.run(models=\"LR\", metric=f2)\n
                                                                                                                                                                                  atom.verbose = 1 # Define a custom metric def f2(y_true, y_pred): return fbeta_score(y_true, y_pred, beta=2) # Use the greater_is_better, needs_proba and needs_threshold parameters if necessary atom.run(models=\"LR\", metric=f2)
                                                                                                                                                                                  \nTraining ========================= >>\nModels: LR\nMetric: f2\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f2: 0.5693\nTest evaluation --> f2: 0.5709\nTime elapsed: 0.863s\n-------------------------------------------------\nTime: 0.863s\n\n\nFinal results ==================== >>\nTotal time: 1.491s\n-------------------------------------\nLogisticRegression --> f2: 0.5709\n
                                                                                                                                                                                  In\u00a0[14]: Copied!
                                                                                                                                                                                  # You can use the est_params parameter to customize the estimator\n# Let's run AdaBoost using LR instead of a decision tree as base estimator\natom.run(\"AdaB\", est_params={\"base_estimator\": atom.lr.estimator})\n
                                                                                                                                                                                  # You can use the est_params parameter to customize the estimator # Let's run AdaBoost using LR instead of a decision tree as base estimator atom.run(\"AdaB\", est_params={\"base_estimator\": atom.lr.estimator})
                                                                                                                                                                                  \nTraining ========================= >>\nModels: AdaB\nMetric: f2\n\n\nResults for AdaBoost:\nFit ---------------------------------------------\nTrain evaluation --> f2: 0.556\nTest evaluation --> f2: 0.5636\nTime elapsed: 2.568s\n-------------------------------------------------\nTime: 2.568s\n\n\nFinal results ==================== >>\nTotal time: 3.065s\n-------------------------------------\nAdaBoost --> f2: 0.5636\n
                                                                                                                                                                                  In\u00a0[15]: Copied!
                                                                                                                                                                                  atom.adab.estimator\n
                                                                                                                                                                                  atom.adab.estimator Out[15]:
                                                                                                                                                                                  AdaBoostClassifier(base_estimator=LogisticRegression(n_jobs=1, random_state=1),\n                   random_state=1)
                                                                                                                                                                                  In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.AdaBoostClassifier
                                                                                                                                                                                  AdaBoostClassifier(base_estimator=LogisticRegression(n_jobs=1, random_state=1),\n                   random_state=1)
                                                                                                                                                                                  base_estimator: LogisticRegression
                                                                                                                                                                                  LogisticRegression(n_jobs=1, random_state=1)
                                                                                                                                                                                  LogisticRegression
                                                                                                                                                                                  LogisticRegression(n_jobs=1, random_state=1)
                                                                                                                                                                                  In\u00a0[16]: Copied!
                                                                                                                                                                                  # Note that parameters specified by est_params are not optimized in the BO\natom.run(\n    models=\"Tree\",\n    n_trials=10,\n    est_params={\n        \"criterion\": \"gini\",\n        \"splitter\": \"best\",\n        \"min_samples_leaf\": 1,\n        \"ccp_alpha\": 0.035,\n    },\n    verbose=2,\n)\n
                                                                                                                                                                                  # Note that parameters specified by est_params are not optimized in the BO atom.run( models=\"Tree\", n_trials=10, est_params={ \"criterion\": \"gini\", \"splitter\": \"best\", \"min_samples_leaf\": 1, \"ccp_alpha\": 0.035, }, verbose=2, )
                                                                                                                                                                                  \nTraining ========================= >>\nModels: Tree\nMetric: f2\n\n\nRunning hyperparameter tuning for DecisionTree...\n| trial | max_depth | min_samples_split | max_features |      f2 | best_f2 | time_trial | time_ht |    state |\n| ----- | --------- | ----------------- | ------------ | ------- | ------- | ---------- | ------- | -------- |\n| 0     |        13 |                12 |          0.5 |  0.4362 |  0.4362 |     3.161s |  3.161s | COMPLETE |\n| 1     |        14 |                16 |         log2 |  0.4729 |  0.4729 |     2.872s |  6.033s | COMPLETE |\n| 2     |        16 |                13 |          0.8 |  0.4626 |  0.4729 |     3.201s |  9.234s | COMPLETE |\n| 3     |         9 |                 6 |         None |  0.4903 |  0.4903 |     3.075s | 12.309s | COMPLETE |\n| 4     |         5 |                 2 |         log2 |  0.4889 |  0.4903 |     2.812s | 15.121s | COMPLETE |\n| 5     |         1 |                15 |          0.5 |  0.4953 |  0.4953 |     2.827s | 17.948s | COMPLETE |\n| 6     |        15 |                 9 |         sqrt |  0.5004 |  0.5004 |     2.951s | 20.899s | COMPLETE |\n| 7     |        13 |                20 |         None |  0.5004 |  0.5004 |     3.242s | 24.141s | COMPLETE |\n| 8     |         3 |                19 |          0.5 |  0.4936 |  0.5004 |     2.800s | 26.941s | COMPLETE |\n| 9     |        15 |                20 |         sqrt |  0.4762 |  0.5004 |     3.170s | 30.111s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 6\nBest parameters:\n --> max_depth: 15\n --> min_samples_split: 9\n --> max_features: sqrt\nBest evaluation --> f2: 0.5004\nTime elapsed: 30.111s\nFit ---------------------------------------------\nTrain evaluation --> f2: 0.4925\nTest evaluation --> f2: 0.4925\nTime elapsed: 
0.452s\n-------------------------------------------------\nTime: 30.563s\n\n\nFinal results ==================== >>\nTotal time: 30.885s\n-------------------------------------\nDecisionTree --> f2: 0.4925\n

                                                                                                                                                                                  Note that both instances need to be initialized with the same data and use the same metric for model training to be able to merge.

                                                                                                                                                                                  In\u00a0[17]: Copied!
                                                                                                                                                                                  tempdir = tempfile.gettempdir()\n
                                                                                                                                                                                  tempdir = tempfile.gettempdir() In\u00a0[18]: Copied!
                                                                                                                                                                                  # Save the atom instance as a pickle\n# Use save_data=False to save the instance without the data\natom.save(tempdir + \"atom\", save_data=False)\n
                                                                                                                                                                                  # Save the atom instance as a pickle # Use save_data=False to save the instance without the data atom.save(tempdir + \"atom\", save_data=False)
                                                                                                                                                                                  ATOMClassifier successfully saved.\n
                                                                                                                                                                                  In\u00a0[20]: Copied!
                                                                                                                                                                                  # No need to store the transformed data, providing the original dataset to\n# the loader automatically transforms it through all the steps in the pipeline\natom_2 = ATOMClassifier.load(tempdir + \"atom\", data=(X,))\n
                                                                                                                                                                                  # No need to store the transformed data, providing the original dataset to # the loader automatically transforms it through all the steps in the pipeline atom_2 = ATOMClassifier.load(tempdir + \"atom\", data=(X,))
                                                                                                                                                                                  ATOMClassifier successfully loaded.\n
                                                                                                                                                                                  In\u00a0[21]: Copied!
                                                                                                                                                                                  # Create a separate instance with its own branch and model\natom_3 = ATOMClassifier(X, verbose=0, random_state=1)\natom_3.branch.name = \"lightgbm\"\natom_3.impute()\natom_3.encode()\natom_3.run(\"LGB\", metric=f2)\n
                                                                                                                                                                                  # Create a separate instance with its own branch and model atom_3 = ATOMClassifier(X, verbose=0, random_state=1) atom_3.branch.name = \"lightgbm\" atom_3.impute() atom_3.encode() atom_3.run(\"LGB\", metric=f2) In\u00a0[22]: Copied!
                                                                                                                                                                                  # Merge the instances\natom_2.merge(atom_3)\n
                                                                                                                                                                                  # Merge the instances atom_2.merge(atom_3)
                                                                                                                                                                                  Merging instances...\n --> Merging branch lightgbm.\n --> Merging model LGB.\n --> Merging attributes.\n
                                                                                                                                                                                  In\u00a0[23]: Copied!
                                                                                                                                                                                  # Note that it now contains both branches and all models\natom_2\n
                                                                                                                                                                                  # Note that it now contains both branches and all models atom_2 Out[23]:
                                                                                                                                                                                  ATOMClassifier\n --> Branches:\n   --> main !\n   --> lightgbm\n --> Models: LR, AdaB, Tree, LGB\n --> Metric: f2
                                                                                                                                                                                  In\u00a0[24]: Copied!
                                                                                                                                                                                  atom_2.results\n
                                                                                                                                                                                  atom_2.results Out[24]: f2_train f2_test time_fit time frac model 0.8 AdaB 0.5599 0.5590 2.568021 2.568021 LR 0.5723 0.5685 0.863496 0.863496 Tree 0.4930 0.4928 0.452411 30.563017 1.0 LGB 0.6578 0.5909 3.991159 3.991159"}, {"location": "examples/utilities/#example-utilities", "title": "Example: Utilities\u00b6", "text": "

                                                                                                                                                                                  This example shows various useful utilities that can be used to improve atom's pipelines.

                                                                                                                                                                                  The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target RainTomorrow.

                                                                                                                                                                                  "}, {"location": "examples/utilities/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/utilities/#use-the-utility-attributes", "title": "Use the utility attributes\u00b6", "text": ""}, {"location": "examples/utilities/#use-the-stats-method-to-assess-changes-in-the-dataset", "title": "Use the stats method to assess changes in the dataset\u00b6", "text": ""}, {"location": "examples/utilities/#inspect-feature-distributions", "title": "Inspect feature distributions\u00b6", "text": ""}, {"location": "examples/utilities/#change-the-data-mid-pipeline", "title": "Change the data mid-pipeline\u00b6", "text": ""}, {"location": "examples/utilities/#get-an-overview-of-the-available-models", "title": "Get an overview of the available models\u00b6", "text": ""}, {"location": "examples/utilities/#use-a-custom-metric", "title": "Use a custom metric\u00b6", "text": ""}, {"location": "examples/utilities/#customize-the-estimators-parameters", "title": "Customize the estimator's parameters\u00b6", "text": ""}, {"location": "examples/utilities/#save-load", "title": "Save & load\u00b6", "text": ""}, {"location": "user_guide/accelerating/", "title": "Accelerating pipelines", "text": "

                                                                                                                                                                                  For very large datasets, ATOM offers various ways to accelerate its pipeline:

                                                                                                                                                                                  • Run estimators on GPU
                                                                                                                                                                                  • Use a faster data engine
                                                                                                                                                                                  • Use a faster estimator engine
                                                                                                                                                                                  • Run processes in parallel

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Performance improvements are usually noticeable for datasets larger than ~5M rows. For smaller datasets, using other values than the default can even harm performance!

                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#gpu-acceleration", "title": "GPU acceleration", "text": "

                                                                                                                                                                                  Graphics Processing Units (GPUs) can significantly accelerate calculations for preprocessing steps or training machine learning models. Training models involves compute-intensive matrix multiplications and other operations that can take advantage of a GPU's massively parallel architecture. Training on large datasets can take hours to run on a single processor. However, if you offload those tasks to a GPU, you can reduce training time to minutes instead.

                                                                                                                                                                                  Running transformers and models in atom using a GPU is as easy as initializing the instance with parameter device=\"gpu\". The device parameter accepts any string that follows the SYCL_DEVICE_FILTER filter selector. Examples are:

                                                                                                                                                                                  • device=\"cpu\" (use CPU)
                                                                                                                                                                                  • device=\"gpu\" (use default GPU)
                                                                                                                                                                                  • device=\"gpu:0\" (use first GPU)
                                                                                                                                                                                  • device=\"gpu:1\" (use second GPU)

                                                                                                                                                                                  Combine GPU acceleration with the cuml and sklearnex estimator engines. The XGBoost, LightGBM and CatBoost models come with their own GPU engine. Setting device=\"gpu\" is sufficient to accelerate them with GPU, regardless of the engine parameter.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  ATOM does not support multi-GPU training. If there is more than one GPU on the machine and the device parameter does not specify which one to use, the first one is used by default.

                                                                                                                                                                                  Example

                                                                                                                                                                                  Train a model on a GPU yourself using SageMaker Studio Lab. Just click on the badge above and run the notebook! Make sure to choose the GPU compute type.

                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#data-acceleration", "title": "Data acceleration", "text": "

                                                                                                                                                                                  The data engine can be specified through the engine parameter, which takes a dict with a key data that accepts three values: numpy, pyarrow and modin.

                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#numpy", "title": "numpy", "text": "

                                                                                                                                                                                  ATOM uses pandas as the default library for data handling, which, in turn, uses numpy for all data processing.

                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#pyarrow", "title": "pyarrow", "text": "

                                                                                                                                                                                  PyArrow is a library that provides a way to work with Apache Arrow memory structures. Apache Arrow is a cross-language, platform-independent, in-memory data format that provides an efficient and fast way to serialize and deserialize data. Pandas offers native integration with pyarrow, which atom uses when specifying the pyarrow data engine.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  • The pyarrow backend doesn't work for sparse datasets. If the dataset has any sparse columns, an exception is raised.
                                                                                                                                                                                  • The LightGBM and XGBoost models don't support pyarrow dtypes.
                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#modin", "title": "modin", "text": "

                                                                                                                                                                                  The modin library is a multi-threading, drop-in replacement for pandas, that uses Ray as backend.

                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#estimator-acceleration", "title": "Estimator acceleration", "text": "

                                                                                                                                                                                  The estimator engine can be specified through the engine parameter, which takes a dict with a key estimator that accepts three values: sklearn, sklearnex and cuml. Read here how to run the estimators on GPU instead of CPU.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Estimators accelerated with sklearnex or cuML sometimes use slightly different hyperparameters than their sklearn counterparts.

                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#sklearn", "title": "sklearn", "text": "

                                                                                                                                                                                  This is the default option, which uses the standard estimators from sklearn. Sklearn does not support training on GPU.

                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#sklearnex", "title": "sklearnex", "text": "

                                                                                                                                                                                  The Intel\u00ae Extension for Scikit-learn package (or sklearnex, for brevity) accelerates sklearn models and transformers, keeping full conformance with sklearn's API. Sklearnex is a free software AI accelerator that offers a way to make sklearn code 10\u2013100 times faster. The software acceleration is achieved through the use of vector instructions, IA hardware-specific memory optimizations, threading, and optimizations for all upcoming Intel platforms at launch time. See here an example using the sklearnex engine.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  sklearnex estimators don't support sparse datasets nor multioutput tasks.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Intel\u00ae processors provide better performance than other CPUs.

                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#prerequisites", "title": "Prerequisites", "text": "
                                                                                                                                                                                  • Operating System:
                                                                                                                                                                                    • Linux (Ubuntu, Fedora, etc...)
                                                                                                                                                                                    • Windows 8.1+
                                                                                                                                                                                    • macOS (no GPU support)
                                                                                                                                                                                  • CPU:
                                                                                                                                                                                    • Processor must have x86 architecture.
                                                                                                                                                                                    • Processor must support at least one of SSE2, AVX, AVX2, AVX512 instruction sets.
                                                                                                                                                                                    • ARM* architecture is not supported.
                                                                                                                                                                                  • GPU:
                                                                                                                                                                                    • All Intel\u00ae integrated and discrete GPUs.
                                                                                                                                                                                    • Intel\u00ae GPU drivers.
                                                                                                                                                                                  • Libraries:
                                                                                                                                                                                    • sklearnex>=2023.2.1 (automatically installed with atom when the processor has x86 architecture)
                                                                                                                                                                                    • dpcpp_cpp_rt>=2023.2 (only for GPU acceleration)
                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#supported-estimators", "title": "Supported estimators", "text": "
                                                                                                                                                                                  • Pruner (only for strategy=\"dbscan\")
                                                                                                                                                                                  • FeatureSelector (only for strategy=\"pca\" and dense datasets)

                                                                                                                                                                                  • ElasticNet (only for CPU acceleration)

                                                                                                                                                                                  • KNearestNeighbors
                                                                                                                                                                                  • Lasso (only for CPU acceleration)
                                                                                                                                                                                  • LogisticRegression
                                                                                                                                                                                  • OrdinaryLeastSquares
                                                                                                                                                                                  • RandomForest
                                                                                                                                                                                  • Ridge (only for regression tasks and CPU acceleration)
                                                                                                                                                                                  • SupportVectorMachine (GPU acceleration only supports classification tasks)
                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#cuml", "title": "cuML", "text": "

                                                                                                                                                                                  cuML is the machine learning library of the RAPIDS project. cuML enables you to run traditional tabular ML tasks on GPUs without going into the details of CUDA programming. For large datasets, these GPU-based implementations can complete 10-50x faster than their CPU equivalents.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  • cuML estimators don't support multioutput tasks nor the pyarrow data engine.
                                                                                                                                                                                  • Install cuML using pip install --extra-index-url=https://pypi.nvidia.com cuml-cu11 or pip install --extra-index-url=https://pypi.nvidia.com cuml-cu12 depending on your CUDA version. Read more about RAPIDS' installation here.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Only transformers and predictors are converted to the requested engine. To use a metric from cuML, insert it directly in the run method:

                                                                                                                                                                                  from atom import ATOMClassifier\nfrom cuml.metrics import accuracy_score\nfrom sklearn.datasets import make_classification\n\nX, y = make_classification(n_samples=100, random_state=1)\n\natom = ATOMClassifier(X, y, engine={\"estimator\": \"cuml\"}, verbose=2)\natom.run(\"LR\", metric=accuracy_score)\n
                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#prerequisites_1", "title": "Prerequisites", "text": "
                                                                                                                                                                                  • Operating System:
                                                                                                                                                                                    • Ubuntu 18.04/20.04 or CentOS 7/8 with gcc/g++ 9.0+
                                                                                                                                                                                    • Windows 10+ with WSL2 (see here a tutorial)
                                                                                                                                                                                  • GPU:
                                                                                                                                                                                    • NVIDIA Pascal\u2122 or better with compute capability 6.0+
                                                                                                                                                                                  • Drivers:
                                                                                                                                                                                    • CUDA & NVIDIA Drivers of versions 11.0, 11.2, 11.4 or 11.5
                                                                                                                                                                                  • Libraries:
                                                                                                                                                                                    • cuML>=23.08
                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#supported-estimators_1", "title": "Supported estimators", "text": "
                                                                                                                                                                                  • Cleaner
                                                                                                                                                                                  • Discretizer
                                                                                                                                                                                  • Imputer (only for strat_num!=\"knn\")
                                                                                                                                                                                  • Normalizer
                                                                                                                                                                                  • Pruner (only for strategy=\"dbscan\" and \"hdbscan\")
                                                                                                                                                                                  • Scaler
                                                                                                                                                                                  • Vectorizer
                                                                                                                                                                                  • FeatureSelector (only for strategy=\"pca\")

                                                                                                                                                                                  • BernoulliNB

                                                                                                                                                                                  • CategoricalNB
                                                                                                                                                                                  • ElasticNet
                                                                                                                                                                                  • GaussianNB
                                                                                                                                                                                  • KNearestNeighbors
                                                                                                                                                                                  • Lasso
                                                                                                                                                                                  • LinearSVM
                                                                                                                                                                                  • LogisticRegression
                                                                                                                                                                                  • MultinomialNB
                                                                                                                                                                                  • OrdinaryLeastSquares
                                                                                                                                                                                  • RandomForest
                                                                                                                                                                                  • Ridge (only for regression tasks)
                                                                                                                                                                                  • SupportVectorMachine
                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#parallel-execution", "title": "Parallel execution", "text": "

                                                                                                                                                                                  Another way to accelerate your pipelines is executing processes in parallel. Use the backend parameter to select one of several parallelization backends.

                                                                                                                                                                                  • loky: Used by default, can induce some communication and memory overhead when exchanging input and output data with the worker Python processes. On some rare systems (such as Pyiodide), the loky backend may not be available.
                                                                                                                                                                                  • multiprocessing: Previous process-based backend based on multiprocessing.Pool. Less robust than loky.
                                                                                                                                                                                  • threading: Very low-overhead backend but it suffers from the Python Global Interpreter Lock if the called function relies a lot on Python objects. It's mostly useful when the execution bottleneck is a compiled extension that explicitly releases the GIL (for instance a Cython loop wrapped in a \"with nogil\" block or an expensive call to a library such as numpy).
                                                                                                                                                                                  • ray: Ray is an open-source unified compute framework that makes it easy to scale AI and Python workloads. Read more about Ray here. See here an example use case.

                                                                                                                                                                                  The parallelization backend is applied in the following cases:

                                                                                                                                                                                  • In every individual estimator that uses parallelization internally.
                                                                                                                                                                                  • To calculate cross-validated results during hyperparameter tuning.
                                                                                                                                                                                  • To train multiple models in parallel (when the trainer's parallel parameter is True).
                                                                                                                                                                                  • To calculate partial dependencies in plot_partial_dependence.

                                                                                                                                                                                  Note

                                                                                                                                                                                  The njobs parameter sets the number of cores for the individual models as well as for parallel training. You won't gain much training two models in parallel with 2 cores, when the models also parallelize computations internally. Instead, use parallel training for models that can't parallelize their training (their constructor doesn't have the n_jobs parameter).

                                                                                                                                                                                  "}, {"location": "user_guide/data_cleaning/", "title": "Data cleaning", "text": "

                                                                                                                                                                                  More often than not, you'll need to do some data cleaning before fitting your dataset to a model. Usually, this involves importing different libraries and writing many lines of code. Since ATOM is all about fast exploration and experimentation, it provides various data cleaning classes to apply the most common transformations quickly and easily.

                                                                                                                                                                                  Note

                                                                                                                                                                                  • All of atom's data cleaning methods automatically adopt the relevant transformer attributes (n_jobs, verbose, logger, random_state) from atom. A different choice can be added as parameter to the method call, e.g., atom.scale(verbose=2).
                                                                                                                                                                                  • Like the add method, the data cleaning methods accept the columns parameter to only transform a subset of the dataset's features, e.g., atom.scale(columns=[0, 1]). Read more in the row and column selection section.

                                                                                                                                                                                  "}, {"location": "user_guide/data_cleaning/#balancing-the-data", "title": "Balancing the data", "text": "

                                                                                                                                                                                  One of the common issues found in datasets that are used for classification is imbalanced classes. Data imbalance usually reflects an unequal distribution of classes within a dataset. For example, in a credit card fraud detection dataset, most of transactions are non-fraud, and very few cases are fraud. This leaves us with a very unbalanced ratio of fraud vs non-fraud cases. The Balancer class can oversample the minority class or undersample the majority class using any of the transformers implemented in the imblearn package. It can be accessed from atom through the balance method.

                                                                                                                                                                                  "}, {"location": "user_guide/data_cleaning/#standard-data-cleaning", "title": "Standard data cleaning", "text": "

                                                                                                                                                                                  There are many data cleaning steps that are useful to perform on any dataset before modeling. These are general rules that apply to almost every use case and every task. The Cleaner class is a convenient tool to apply such steps. It can be accessed from atom through the clean method. Use the class' parameters to choose which transformations to perform. The available steps are:

                                                                                                                                                                                  • Drop columns with specific data types.
                                                                                                                                                                                  • Strip categorical features from white spaces.
                                                                                                                                                                                  • Drop duplicate rows.
                                                                                                                                                                                  • Drop rows with missing values in the target column.
                                                                                                                                                                                  • Encode the target column.

                                                                                                                                                                                  "}, {"location": "user_guide/data_cleaning/#binning-numerical-features", "title": "Binning numerical features", "text": "

                                                                                                                                                                                  Discretization (otherwise known as quantization or binning) provides a way to partition continuous features into discrete values. Certain datasets with continuous features may benefit from discretization, because discretization can transform the dataset of continuous attributes to one with only nominal attributes. Discretization is similar to constructing histograms for continuous data. However, histograms focus on counting features which fall into particular bins, whereas discretization focuses on assigning feature values to these bins. The Discretizer class can be used to bin continuous data into intervals. It can be accessed from atom through the discretize method.

                                                                                                                                                                                  "}, {"location": "user_guide/data_cleaning/#encoding-categorical-features", "title": "Encoding categorical features", "text": "

                                                                                                                                                                                  Many datasets contain categorical features. Their variables are typically stored as text values which represent various classes. Some examples include color (\u201cRed\u201d, \u201cYellow\u201d, \u201cBlue\u201d), size (\u201cSmall\u201d, \u201cMedium\u201d, \u201cLarge\u201d) or geographic designations (city or country). Regardless of what the value is used for, the challenge is determining how to use this data in the analysis. The majority of sklearn's models don't support direct manipulation of this kind of data. Use the Encoder class to encode categorical features to numerical values. It can be accessed from atom through the encode method.

                                                                                                                                                                                  There are many strategies to encode categorical columns. The Encoder class applies one strategy or another depending on the number of classes in the column to be encoded. When there are only two, the values are encoded with 0 or 1. When there are more than two, the columns can be encoded using one-hot encoding or any other strategy of the category-encoders package, depending on the value of the max_onehot parameter. One-hot encoding encodes the column by making a dummy feature for every class. This approach preserves all the information but increases the size of the dataset considerably, making it often an undesirable strategy for high cardinality features. Other strategies like Target transform the column in place.

                                                                                                                                                                                  "}, {"location": "user_guide/data_cleaning/#imputing-missing-values", "title": "Imputing missing values", "text": "

                                                                                                                                                                                  For various reasons, many real world datasets contain missing values, often encoded as blanks, NaNs or other placeholders. Such datasets however are incompatible with ATOM's models which assume that all values in an array are numerical, and that all have and hold meaning. The Imputer class handles missing values in the dataset by either dropping or imputing the value. It can be accessed from atom through the impute method.

                                                                                                                                                                                  "}, {"location": "user_guide/data_cleaning/#normalizing-the-feature-set", "title": "Normalizing the feature set", "text": "

                                                                                                                                                                                  Use the Normalizer class to transform the feature set to follow a Normal (Gaussian)-like distribution. In general, data must be transformed when using models that assume normality in the residuals. Examples of such models are LogisticRegression, LinearDiscriminantAnalysis and GaussianNB. The class can be accessed from atom through the normalize method.

                                                                                                                                                                                  "}, {"location": "user_guide/data_cleaning/#handling-outliers", "title": "Handling outliers", "text": "

                                                                                                                                                                                  When modeling, it is important to clean the data sample to ensure that the observations best represent the problem. Sometimes a dataset can contain extreme values that are outside the range of what is expected and unlike the other data. These are called outliers. Often, machine learning modeling and model skill in general can be improved by understanding and even removing these outlier samples. The Pruner class offers 7 different strategies to detect outliers (described hereunder). It can be accessed from atom through the prune method.

                                                                                                                                                                                  z-score The z-score of a value in the dataset is defined as the number of standard deviations by which the value is above or below the mean of the column. Values above or below a certain threshold (specified with the parameter max_sigma) are considered outliers. Note that, contrary to the rest of the strategies, this approach selects outlier values, not outlier samples! Because of this, it is possible to replace the outlier value instead of dropping the entire sample.

                                                                                                                                                                                  Isolation Forest Uses a tree-based anomaly detection algorithm. It is based on modeling the normal data in such a way as to isolate anomalies that are both few and different in the feature space. Read more in sklearn's documentation.

                                                                                                                                                                                  Elliptic Envelope If the input variables have a Gaussian distribution, then simple statistical methods can be used to detect outliers. For example, if the dataset has two input variables and both are Gaussian, the feature space forms a multidimensional Gaussian, and knowledge of this distribution can be used to identify values far from the distribution. This approach can be generalized by defining a hypersphere (ellipsoid) that covers the normal data, and data that falls outside this shape is considered an outlier. Read more in sklearn's documentation.

                                                                                                                                                                                  Local Outlier Factor A simple approach to identifying outliers is to locate those examples that are far from the other examples in the feature space. This can work well for feature spaces with low dimensionality (few features) but becomes less reliable as the number of features is increased. The local outlier factor is a technique that attempts to harness the idea of nearest neighbors for outlier detection. Each example is assigned a score of how isolated or how likely it is to be an outlier based on the size of its local neighborhood. Those examples with the largest score are more likely to be outliers. Read more in sklearn's documentation.

                                                                                                                                                                                  One-class SVM The support vector machine algorithm, initially developed for binary classification tasks, can also be used for one-class classification. When modeling one class, the algorithm captures the density of the majority class and classifies examples on the extremes of the density function as outliers. This modification of SVM is referred to as One-Class SVM. Read more in sklearn's documentation.

                                                                                                                                                                                  DBSCAN The DBSCAN algorithm views clusters as areas of high density separated by areas of low density. Due to this rather generic view, clusters found by DBSCAN can be any shape, as opposed to k-means which assumes that clusters are convex shaped. Samples that lie outside any cluster are considered outliers. Read more in sklearn's documentation.

                                                                                                                                                                                  OPTICS The OPTICS algorithm shares many similarities with the DBSCAN algorithm, and can be considered a generalization of DBSCAN that relaxes the eps requirement from a single value to a value range. The key difference between DBSCAN and OPTICS is that the OPTICS algorithm builds a reachability graph, which assigns each sample both a reachability distance and a spot within the cluster ordering. These two attributes are assigned when the model is fitted, and are used to determine cluster membership. Read more in sklearn's documentation.

                                                                                                                                                                                  "}, {"location": "user_guide/data_cleaning/#scaling-the-feature-set", "title": "Scaling the feature set", "text": "

                                                                                                                                                                                  Standardization of a dataset is a common requirement for many machine learning estimators; they might behave badly if the individual features do not more or less look like standard normally distributed data (e.g. Gaussian with zero mean and unit variance). The Scaler class lets you quickly scale atom's dataset using one of sklearn's scalers. It can be accessed from atom through the scale method.

                                                                                                                                                                                  Info

                                                                                                                                                                                  All strategies can utilize GPU speed-up. Click here for further information about GPU acceleration.

                                                                                                                                                                                  "}, {"location": "user_guide/data_management/", "title": "Data management", "text": ""}, {"location": "user_guide/data_management/#data-sets", "title": "Data sets", "text": "

                                                                                                                                                                                  ATOM is designed to work around one single dataset: the one with which atom is initialized. This is the dataset you want to explore, transform, and use for model training and validation. ATOM differentiates three different data sets:

                                                                                                                                                                                  • The training set is usually the largest of the data sets. As the name suggests, this set is used to train the pipeline. During hyperparameter tuning, only the training set is used to fit and evaluate the estimator in every call. The training set in the current branch can be accessed through the train attribute. Its features and target can be accessed through X_train and y_train respectively.
                                                                                                                                                                                  • The test set is used to evaluate the models. The model scores on this set give an indication on how the model performs on new data. The test set can be accessed through the test attribute. Its features and target can be accessed through X_test and y_test respectively.
                                                                                                                                                                                  • The holdout set is an optional, separate set that should only be used to evaluate the final model's performance. Create this set when you are going to use the test set for an intermediate validation step. The holdout set is immediately set apart during initialization and is not considered part of atom's dataset (the dataset attribute only returns the train and test sets). The holdout set is left untouched until predictions are made on it, i.e., it does not undergo any pipeline transformations until the data set is requested for the first time. The holdout set is stored in atom's holdout attribute. See here for an example that shows how to use the holdout data set.

                                                                                                                                                                                  The data can be provided in different formats. If the data sets are not specified beforehand, you can input the features and target separately or together:

                                                                                                                                                                                  • X
                                                                                                                                                                                  • X, y

                                                                                                                                                                                  Remember to use the y parameter to indicate the target column in X when using the first option. If not specified, the last column in X is used as target. In both these cases, the sizes of the sets are defined using the test_size and holdout_size parameters. Note that the splits are made after subsampling the dataset with the n_rows parameter (when not left to its default value).

                                                                                                                                                                                  If you already have the separate data sets, provide them using one of the following formats:

                                                                                                                                                                                  • train, test
                                                                                                                                                                                  • train, test, holdout
                                                                                                                                                                                  • X_train, X_test, y_train, y_test
                                                                                                                                                                                  • X_train, X_test, X_holdout, y_train, y_test, y_holdout
                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test)
                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)

                                                                                                                                                                                  The input data is always converted internally to a dataframe, if it isn't one already. The column names should always be strings. If they are not, atom changes their type at initialization. If no column names are provided, default names are given of the form X[N-1], where N stands for the n-th feature in the dataset.

                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#indexing", "title": "Indexing", "text": "

                                                                                                                                                                                  By default, atom resets the dataframe's index after initialization and after every transformation in the pipeline. To avoid this, specify the index parameter. If the dataset has an 'identifier' column, it is useful to use it as index for two reasons:

                                                                                                                                                                                  • An identifier doesn't usually contain any useful information on the target column, and should therefore be removed before training.
                                                                                                                                                                                  • Predictions of specific rows can be accessed through their index.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Avoid duplicate indices in the dataframe. Having them raises an error when initializing atom and may potentially lead to unexpected behavior if introduced later.

                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#sparse-datasets", "title": "Sparse datasets", "text": "

                                                                                                                                                                                  If atom is initialized using a scipy sparse matrix, it is converted internally to a dataframe of sparse columns. Read more about pandas' sparse data structures here. The same conversion takes place when a transformer returns a sparse matrix, like for example, the Vectorizer.

                                                                                                                                                                                  Note that ATOM considers a dataset to be sparse if any of the columns is sparse. A dataset can only benefit from sparsity when all its columns are sparse, hence mixing sparse and non-sparse columns is not recommended and can cause estimators to decrease their training speed or even crash. Use the shrink method to convert dense features to sparse and the available_models method to check which models have native support for sparse matrices.

                                                                                                                                                                                  Click here to see an example that uses sparse data.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Estimators accelerated using sklearnex don't support sparse datasets.

                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#multioutput-tasks", "title": "Multioutput tasks", "text": "

                                                                                                                                                                                  Multioutput is a task where there is more than one target column, i.e., the goal is to predict multiple targets at the same time. When providing a dataframe as target, use the y parameter. Providing y without keyword makes ATOM think you are providing train, test (see the data sets section).

                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#task-types", "title": "Task types", "text": "

                                                                                                                                                                                  ATOM recognizes four multioutput tasks.

                                                                                                                                                                                  Note

                                                                                                                                                                                  Combinations of binary and multiclass target columns are treated as multiclass-multioutput tasks.

                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#multilabel", "title": "Multilabel", "text": "

                                                                                                                                                                                  Multilabel is a classification task, labeling each sample with m labels from n_classes possible classes, where m can be 0 to n_classes inclusive. This can be thought of as predicting properties of a sample that are not mutually exclusive.

                                                                                                                                                                                  For example, prediction of the topics relevant to a text document. The document may be about one of religion, politics, finance or education, several of the topic classes or all of the topic classes. The target column (atom.y) could look like this:

                                                                                                                                                                                  0                        [politics]\n1               [religion, finance]\n2    [politics, finance, education]\n3                                []\n4                         [finance]\n5               [finance, religion]\n6                         [finance]\n7               [religion, finance]\n8                       [education]\n9     [finance, religion, politics]\n\nName: target, dtype: object\n

                                                                                                                                                                                  A model cannot directly ingest a variable number of target classes. Use the clean method to assign a binary output to each class, for every sample. Positive classes are indicated with 1 and negative classes with 0. It is thus comparable to running n_classes binary classification tasks. In our example, the target (atom.y) is converted to:

                                                                                                                                                                                     education  finance  politics  religion\n0          0        0         1         0\n1          0        1         0         1\n2          1        1         1         0\n3          0        0         0         0\n4          0        1         0         0\n5          0        1         0         1\n6          0        1         0         0\n7          0        1         0         1\n8          1        0         0         0\n9          0        1         1         1\n
                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#multiclass-multioutput", "title": "Multiclass-multioutput", "text": "

                                                                                                                                                                                  Multiclass-multioutput (also known as multitask classification) is a classification task which labels each sample with a set of non-binary properties. Both the number of properties and the number of classes per property are greater than 2. A single estimator thus handles several joint classification tasks. This is both a generalization of the multilabel classification task, which only considers binary attributes, as well as a generalization of the multiclass classification task, where only one property is considered.

                                                                                                                                                                                  For example, classification of the properties \"type of fruit\" and \"colour\" for a set of images of fruit. The property \"type of fruit\" has the possible classes: \"apple\", \"pear\" and \"orange\". The property \"colour\" has the possible classes: \"green\", \"red\", \"yellow\" and \"orange\". Each sample is an image of a fruit, a label is output for both properties and each label is one of the possible classes of the corresponding property.

                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#multioutput-regression", "title": "Multioutput regression", "text": "

                                                                                                                                                                                  Multioutput regression predicts multiple numerical properties for each sample. Each property is a numerical variable and the number of properties to be predicted for each sample is >= 2. Some estimators that support multioutput regression are faster than just running n_output estimators.

                                                                                                                                                                                  For example, prediction of both wind speed and wind direction, in degrees, using data obtained at a certain location. Each sample would be data obtained at one location and both wind speed and direction would be output for each sample.

                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#multivariate", "title": "Multivariate", "text": "

                                                                                                                                                                                  Multivariate is the multioutput task for forecasting. In this case, we try to forecast more than one time series at the same time.

                                                                                                                                                                                  Although all forecasting models in ATOM support multivariate tasks, we differentiate two types of models:

                                                                                                                                                                                  • The \"native multivariate\" models apply forecasts where every prediction of endogeneous (y) variables will depend on values of the other target columns.
                                                                                                                                                                                  • The rest of the models apply an estimator per column, meaning that forecasts will be made per endogenous variable, and not be affected by other variables. To access the column-wise estimators, use the estimator's forecasters_ parameter, which stores the fitted forecasters in a dataframe.

                                                                                                                                                                                  Read more about time series tasks here.

                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#native-multioutput-models", "title": "Native multioutput models", "text": "

                                                                                                                                                                                  Some models have native support for multioutput tasks. This means that the original estimator is used to make predictions directly on all the target columns. Examples of such models are KNearestNeighbors, RandomForest and ExtraTrees.

                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#non-native-multioutput-models", "title": "Non-native multioutput models", "text": "

                                                                                                                                                                                  The majority of the models don't have integrated support for multioutput tasks. However, it's possible to still use them for such tasks, wrapping them in a meta-estimator capable of handling multiple target columns. For non-native multioutput models, ATOM does so automatically. For multilabel tasks, the meta-estimator is:

                                                                                                                                                                                  • ClassifierChain

                                                                                                                                                                                  And for multiclass-multioutput and multioutput regression, the meta-estimators are respectively:

                                                                                                                                                                                  • MultioutputClassifier
                                                                                                                                                                                  • MultioutputRegressor

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Currently, scikit-learn metrics do not support multiclass-multioutput classification tasks. In this case, ATOM calculates the mean of the selected metric over every individual target.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  • Set the native_multilabel or native_multioutput parameter in ATOMModel equal to True to ignore the meta-estimator for custom models.
                                                                                                                                                                                  • Check out the multilabel classification and multioutput regression examples.

                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#branches", "title": "Branches", "text": "

                                                                                                                                                                                  You might want to compare how a model performs on a dataset transformed through multiple pipelines, each using different transformers. For example, on one pipeline with an undersampling strategy and the other with an oversampling strategy. To be able to do this, ATOM has a branching system.

                                                                                                                                                                                  The branching system helps the user to manage multiple data pipelines within the same atom instance. Branches are created and accessed through atom's branch property. A branch contains a specific pipeline, the dataset transformed through that pipeline, and all data and utility attributes that refer to that dataset. Transformers and models called from atom use the dataset in the current branch, as well as data attributes such as atom.dataset. It's not allowed to change the data in a branch after fitting a model with it. Instead, create a new branch for every unique pipeline.

                                                                                                                                                                                  By default, atom starts with one branch called \"main\". To start a new branch, set a new name to the property, e.g., atom.branch = \"undersample\". This creates a new branch from the current one. To create a branch from any other branch, type \"_from_\" between the new name and the branch from which to split, e.g., atom.branch = \"oversample_from_main\" creates branch \"oversample\" from branch \"main\", even if the current branch is \"undersample\". To switch between existing branches, just type the name of the desired branch, e.g., atom.branch = \"main\" brings you back to the main branch. Note that every branch contains a unique copy of the whole dataset! Creating many branches can cause memory issues for large datasets.

                                                                                                                                                                                  See the Imbalanced datasets or Feature engineering examples for branching use cases.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Always create a new branch if you want to change the dataset after fitting a model! Forcing a data change through the data property's @setter can cause unexpected model behavior and break down the plotting methods.

                                                                                                                                                                                  Figure 1. Diagram of a possible branch system to compare an oversampling with an undersampling pipeline.

                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#memory-considerations", "title": "Memory considerations", "text": "

                                                                                                                                                                                  An atom instance stores one copy of the dataset for each branch (this doesn't include the holdout set, which is only stored once), and one copy of the initial dataset with which the instance is initialized. This copy of the original dataset is necessary to avoid data leakage during hyperparameter tuning and for some specific methods like cross_validate and reset. It's created as soon as there are no branches in the initial state (usually after calling the first data transformation). If the dataset is occupying too much memory, consider using the shrink method to convert the dtypes to their smallest possible matching dtype.

                                                                                                                                                                                  When working with large datasets and multiple branches, it becomes impossible to store all branches in memory at the same time. To avoid out-of-memory errors, use atom's memory parameter. If not False, atom saves the data of inactive branches as well as the original branch at the specified location (in a directory called joblib, the name of the underlying library managing the caching), maintaining only the current active branch in memory. This mechanism results in a slight drop in performance because of the I/O overhead, but can save a lot of memory. Additionally, the memory's location is also used to cache the output of the fit method of transformers in the pipeline. See here an example using the memory parameter.

                                                                                                                                                                                  Apart from the dataset itself, a model's metric scores and shap values are also stored as attributes of the model to avoid having to recalculate them every time they are needed. You can delete all these attributes using the clear method in order to free some memory before saving atom.

                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#data-transformations", "title": "Data transformations", "text": "

                                                                                                                                                                                  Performing data transformations is a common requirement of many datasets before they are ready to be ingested by a model. ATOM provides various classes to apply data cleaning and feature engineering transformations to the data. This tooling should be able to help you apply most of the typically needed transformations to get the data ready for modeling. For further fine-tuning, it's also possible to transform the data using custom transformers (see the add method) or through a function (see the apply method). Remember that all transformations are only applied to the dataset in the current branch.

                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#row-and-column-selection", "title": "Row and column selection", "text": "

                                                                                                                                                                                  Many methods in atom contain the rows or columns parameter to select a subset of the dataset. Examples are the evaluate and save_data methods for rows, and the distribution and shrink methods for columns. All data cleaning and feature engineering methods use the columns parameter to apply the transformation only to that selection of columns, and all prediction methods use the rows parameter to make predictions on that selection of rows.

                                                                                                                                                                                  As you can see, these two parameters are very important and shared across many methods in atom. Rows and columns can be selected in multiple ways. The check is performed in the order described hereunder:

                                                                                                                                                                                  1. By actual dataset, e.g., rows=atom.test is equal to rows=\"test\".
                                                                                                                                                                                  2. By range or slice, e.g., rows=range(100) to select the first 100 rows from the dataset or rows=slice(20, 100) to select rows 20 to 99.
                                                                                                                                                                                  3. By exact name, e.g., rows=[\"row1\", \"row2\"] to select rows with indices row1 and row2 or columns=[\"col1\", \"col2\"] to select columns col1 and col2. It's also possible to use the + sign to select multiple rows or columns, e.g., columns=\"col1+col2\" is the same as columns=[\"col1\", \"col2\"].
                                                                                                                                                                                  4. By position, e.g., rows=[0, 1, 2] to select the first three rows.
                                                                                                                                                                                  5. By name of the data set (only for rows), e.g., rows=\"train\" to select all rows in the training set, or rows=\"test+holdout\" to select all rows in the test and holdout sets. Valid data sets are dataset, train, test and holdout.
                                                                                                                                                                                  6. By dtype (only for columns), e.g., columns=\"number\" to select only numerical columns. See pandas' user guide.
                                                                                                                                                                                  7. By regex match, e.g., columns=\"mean_.*\" to select all columns starting with mean_.
                                                                                                                                                                                  8. Excluding instead of including using the ! sign, e.g., columns=\"!col1\" to select all columns except col1. You can also exclude multiple rows or columns like this columns=[\"!col1\", \"!col2\"] or this columns=\"!col1+!col2\". It's also possible to exclude data sets for row selection, e.g., rows=\"!train\" or dtypes for column selection, e.g., columns=\"!number\". Note that if a column name starts with !, the selection of that name will take priority over exclusion. Rows and columns can only be included or excluded, and not both at the same time. For example, this selection raises an exception: columns=[\"col1\", \"!col2\"].

                                                                                                                                                                                  Info

                                                                                                                                                                                  In some plotting methods, it's possible to plot separate lines for different subsets of the rows. For example, to compare the results on the train and test set. For these cases, either provide a sequence to the rows parameter for every line you want to draw, e.g., atom.plot_roc(rows=(\"train\", \"test\")), or provide a dictionary where the keys are the names of the sets (used in the legend) and the values are the corresponding selection of rows, selected using any of the aforementioned approaches, e.g., atom.plot_roc(rows={\"0-99\": range(100), \"100-199\": range(100, 200)}). Note that for these methods, using atom.plot_roc(rows=\"train+test\"), only plots one line with the data from both sets. See the advanced plotting example.

                                                                                                                                                                                  "}, {"location": "user_guide/feature_engineering/", "title": "Feature engineering", "text": "

                                                                                                                                                                                  Feature engineering is the process of creating new features from the existing ones, in order to capture relationships with the target column that the first set of features didn't have on their own. This process is very important to improve the performance of machine learning algorithms. Although feature engineering works best when the data scientist applies use-case specific transformations, there are ways to do this in an automated manner, without prior domain knowledge. One of the problems of creating new features without human expert intervention, is that many of the newly created features can be useless, i.e., they do not help the algorithm to make better predictions. Even worse, having useless features can drop your performance. To avoid this, we perform feature selection, a process in which we select the relevant features in the dataset. See the Feature engineering example.

                                                                                                                                                                                  Note

                                                                                                                                                                                  • All of atom's feature engineering methods automatically adopt the relevant transformer attributes (n_jobs, verbose, logger, random_state) from atom. A different choice can be added as parameter to the method call, e.g., atom.feature_selection(\"pca\", n_features=10, random_state=2).
                                                                                                                                                                                  • Like the add method, the feature engineering methods accept the columns parameter to only transform a subset of the dataset's features, e.g., atom.feature_selection(\"pca\",n_features=10, columns=slice(5, 15)). Read more in the row and column selection section.

                                                                                                                                                                                  "}, {"location": "user_guide/feature_engineering/#extracting-datetime-features", "title": "Extracting datetime features", "text": "

                                                                                                                                                                                  Features that contain dates or timestamps can not be directly ingested by models since they are not strictly numerical. Encoding them as categorical features is not an option since the encoding does not capture the relationship between the different moments in time. The FeatureExtractor class creates new features extracting datetime elements (e.g., day, month, year, hour...) from the columns. It can be accessed from atom through the feature_extraction method. The new features are named equally to the column from which they are extracted, followed by an underscore and the datetime element they create, e.g., x0_day for the day element of x0.

                                                                                                                                                                                  Note that many time features have a cyclic pattern, e.g., after Sunday comes Monday. This means that if we would encode the days of the week from 0 to 6, we would lose that relation. A common method used to encode cyclical features is to transform the data into two dimensions using a sine and cosine transformation:

                                                                                                                                                                                  \\[ x_{sin} = sin\\left(\\frac{2\\pi * x}{max(x)}\\right) \\] \\[ x_{cos} = cos\\left(\\frac{2\\pi * x}{max(x)}\\right) \\]

                                                                                                                                                                                  The resulting features have their names followed by sin or cos, e.g. x0_day_sin and x0_day_cos. The datetime elements that can be encoded in a cyclic fashion are: microsecond, second, minute, hour, weekday, day, day_of_year, month and quarter. Note that decision trees based algorithms build their split rules according to one feature at a time. This means that they will fail to correctly process cyclic features since the sin/cos values are expected to be considered as one single coordinate system.

                                                                                                                                                                                  Use the fmt parameter to specify your feature's format in case the column is categorical. The FeatureExtractor class will convert the column to the datetime dtype before extracting the specified features. Click here for an overview of the available formats.

                                                                                                                                                                                  "}, {"location": "user_guide/feature_engineering/#generating-new-features", "title": "Generating new features", "text": "

                                                                                                                                                                                  The FeatureGenerator class creates new non-linear features based on the original feature set. It can be accessed from atom through the feature_generation method. You can choose between two strategies: Deep Feature Synthesis and Genetic Feature Generation.

                                                                                                                                                                                  Deep Feature Synthesis Deep feature synthesis (DFS) applies the selected operators on the features in the dataset. For example, if the operator is \"log\", it will create the new feature LOG(old_feature) and if the operator is \"mul\", it will create the new feature old_feature_1 x old_feature_2. The operators can be chosen through the operators parameter. Choose from:

                                                                                                                                                                                  • add: Take the sum of two features.
                                                                                                                                                                                  • sub: Subtract two features from each other.
                                                                                                                                                                                  • mul: Multiply two features with each other.
                                                                                                                                                                                  • div: Divide two features with each other.
                                                                                                                                                                                  • abs: Calculate the absolute value of a feature.
                                                                                                                                                                                  • sqrt: Calculate the square root of a feature.
                                                                                                                                                                                  • log: Calculate the natural logarithm of a feature.
                                                                                                                                                                                  • sin: Calculate the sine of a feature.
                                                                                                                                                                                  • cos: Calculate the cosine of a feature.
                                                                                                                                                                                  • tan: Calculate the tangent of a feature.

                                                                                                                                                                                  ATOM's implementation of DFS uses the featuretools package.

                                                                                                                                                                                  Genetic Feature Generation Genetic feature generation (GFG) uses genetic programming, a branch of evolutionary programming, to determine which features are successful and create new ones based on those. Where dfs can be seen as some kind of \"brute force\" for feature engineering, gfg tries to improve its features with every generation of the algorithm. gfg uses the same operators as dfs, but instead of only applying the transformations once, it evolves them further, creating nested structures of combinations of features. The new features are given the name feature_n, where n stands for the n-th feature in the dataset. You can access the genetic feature's fitness and description (how they are calculated) through the genetic_features attribute.

                                                                                                                                                                                  ATOM uses the SymbolicTransformer class from the gplearn package for the genetic algorithm. Read more about this implementation here.

                                                                                                                                                                                  "}, {"location": "user_guide/feature_engineering/#grouping-similar-features", "title": "Grouping similar features", "text": "

                                                                                                                                                                                  When your dataset contains many similar features corresponding to a certain natural group or entity, it's possible to replace these features for a handful of them, that should capture the relations of the group, in order to lose as little information as possible. To achieve this, the FeatureGrouper class computes certain statistical properties that describe the group's distribution, like the mean or the median, and replaces the columns with the result of these statistical calculations over every row in the dataset. The goal of this approach is to reduce the number of columns in the dataset, avoiding the curse of dimensionality.

                                                                                                                                                                                  "}, {"location": "user_guide/feature_engineering/#selecting-useful-features", "title": "Selecting useful features", "text": "

                                                                                                                                                                                  The FeatureSelector class provides tooling to select the relevant features from a dataset. It can be accessed from atom through the feature_selection method.

                                                                                                                                                                                  "}, {"location": "user_guide/feature_engineering/#standard-strategies", "title": "Standard strategies", "text": "

                                                                                                                                                                                  Univariate Univariate feature selection works by selecting the best features based on univariate statistical F-test. The test is provided via the solver parameter. It takes any function taking two arrays (X, y), and returning arrays (scores, p-values). Read more in sklearn's documentation.

                                                                                                                                                                                  Principal Components Analysis Applying PCA reduces the dimensionality of the dataset by maximizing the variance of each dimension. The new features are called pca0, pca1, etc... PCA can be applied in three ways:

                                                                                                                                                                                  • If the data is dense (i.e., not sparse), the estimator used is PCA. Before fitting the transformer, the data is scaled to mean=0 and std=1 if it wasn't already. Read more in sklearn's documentation.
                                                                                                                                                                                  • If the data is sparse (often the case for term-document matrices, see Vectorizer), the estimator used is TruncatedSVD. Read more in sklearn's documentation.
                                                                                                                                                                                  • If engine is \"sklearnex\" or \"cuml\", the estimator used is the package's PCA implementation. Sparse data is not supported for either engine.

                                                                                                                                                                                  Selection from model SFM uses an estimator with feature_importances_ or coef_ attributes to select the best features in a dataset based on importance weights. The estimator is provided through the solver parameter and can be already fitted. ATOM allows you to use one of its predefined models, e.g., solver=\"RF\". If you didn't call the FeatureSelector through atom, don't forget to indicate the estimator's task adding _class or _reg after the name, e.g., RF_class to use a random forest classifier. Read more in sklearn's documentation.

                                                                                                                                                                                  Sequential Feature Selection Sequential feature selection adds (forward selection) or removes (backward selection) features to form a feature subset in a greedy fashion. At each stage, this estimator chooses the best feature to add or remove based on the cross-validation score of an estimator. Read more in sklearn's documentation.

                                                                                                                                                                                  Recursive Feature Elimination Select features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features, and the importance of each feature is obtained either through a coef_ or through a feature_importances_ attribute. Then, the least important features are pruned from current set of features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached. Note that, since RFE needs to fit the model again every iteration, this method can be fairly slow.

                                                                                                                                                                                  RFECV applies the same algorithm as RFE but uses a cross-validated metric (under the scoring parameter, see RFECV) to assess every step's performance. Also, where RFE returns the number of features selected by n_features, RFECV returns the number of features that achieved the optimal score on the specified metric. Note that this is not always equal to the amount specified by n_features. Read more in sklearn's documentation.

                                                                                                                                                                                  "}, {"location": "user_guide/feature_engineering/#advanced-strategies", "title": "Advanced strategies", "text": "

                                                                                                                                                                                  The following strategies are a collection of nature-inspired optimization algorithms that maximize an objective function. If not manually specified, the function calculates the cross-validated score of a model on the data. Use the scoring parameter (not present in description, part of kwargs) to specify the metric to optimize on.

                                                                                                                                                                                  Particle Swarm Optimization Particle Swarm Optimization (PSO) optimizes a problem by having a population of candidate solutions (particles), and moving them around in the search-space according to simple mathematical formula over the particle's position and velocity. Each particle's movement is influenced by its local best known position, but is also guided toward the best known positions in the search-space, which are updated as better positions are found by other particles. This is expected to move the swarm toward the best solutions. Read more here.

                                                                                                                                                                                  Harris Hawks Optimization Harris Hawks Optimization (HHO) mimics the action and reaction of Hawk's team collaboration hunting in nature and prey escaping to discover the solutions of the single-objective problem. Read more here.

                                                                                                                                                                                  Grey Wolf Optimization The Grey Wolf Optimizer (GWO) mimics the leadership hierarchy and hunting mechanism of grey wolves in nature. Four types of grey wolves such as alpha, beta, delta, and omega are employed for simulating the leadership hierarchy. In addition, three main steps of hunting, searching for prey, encircling prey, and attacking prey, are implemented to perform optimization. Read more here.

                                                                                                                                                                                  Dragonfly Optimization The Dragonfly Algorithm (DFO) originates from static and dynamic swarming behaviours. These two swarming behaviours are very similar to the two main phases of optimization using meta-heuristics: exploration and exploitation. Dragonflies create sub swarms and fly over different areas in a static swarm, which is the main objective of the exploration phase. In the dynamic swarm, however, dragonflies fly in bigger swarms and along one direction, which is favourable in the exploitation phase. Read more here.

                                                                                                                                                                                  Genetic Optimization Genetic Optimization is a metaheuristic inspired by the process of natural selection that belongs to the larger class of evolutionary algorithms. Genetic algorithms are commonly used to generate high-quality solutions to optimization and search problems by relying on biologically inspired operators such as mutation, crossover and selection. Read more here.

                                                                                                                                                                                  "}, {"location": "user_guide/feature_engineering/#other-selection-methods", "title": "Other selection methods", "text": "

                                                                                                                                                                                  Removing features with low or high variance Variance is the expectation of the squared deviation of a random variable from its mean. Features with low variance have many values repeated, which means the model can't learn much from them. In a similar way, features with very high variance have very few values repeated, which makes it also difficult for a model to learn from this feature.

                                                                                                                                                                                  FeatureSelector removes a categorical feature when the maximum number of occurrences for any value is below min_repeated or when the same value is repeated in at least max_repeated fraction of the rows. The default option is to remove a feature if all values in it are either different or exactly the same.

                                                                                                                                                                                  Removing features with multi-collinearity Two features that are highly correlated are redundant, i.e., two will not contribute more to the model than only one of them. FeatureSelector will drop a feature that has a Pearson correlation coefficient larger than max_correlation with another feature. A correlation of 1 means the two columns are equal. A dataframe of the removed features and their correlation values can be accessed through the collinear attribute.

                                                                                                                                                                                  "}, {"location": "user_guide/introduction/", "title": "Introduction", "text": "

                                                                                                                                                                                  There is no magic formula in data science that can tell us which type of machine learning estimator in combination with which pipeline will perform best for a given raw dataset. Different models are better suited for different types of data and different types of problems. You can follow some rough guide on how to approach problems with regard to which model to try, but these are incomplete at best.

                                                                                                                                                                                  During the exploration phase of a machine learning project, a data scientist tries to find the optimal pipeline for his specific use case. This usually involves applying standard data cleaning steps, creating or selecting useful features, trying out different models, etc. Testing multiple pipelines requires many lines of code, and writing it all in the same notebook often makes it long and cluttered. On the other hand, using multiple notebooks makes it harder to compare the results and to keep an overview. On top of that, refactoring the code for every test can be quite time-consuming. How many times have you conducted the same action to pre-process a raw dataset? How many times have you copy-and-pasted code from an old repository to re-use it in a new use case?

                                                                                                                                                                                  Although best practices tell us to start with a simple model and build up to more complicated ones, many data scientists just use the model best known to them in order to avoid the aforementioned problems. This can result in poor performance (because the model is just not the right one for the task) or in inefficient management of time and computing resources (because a simpler/faster model could have achieved a similar performance).

                                                                                                                                                                                  ATOM is here to help solve these common issues. The package acts as a wrapper of the whole machine learning pipeline, helping the data scientist to rapidly find a good model for his problem. Avoid endless imports and documentation lookups. Avoid rewriting the same code over and over again. With just a few lines of code, it's now possible to perform basic data cleaning steps, select relevant features and compare the performance of multiple models on a given dataset, providing quick insights on which pipeline performs best for the task at hand.

                                                                                                                                                                                  It is important to realize that ATOM is not here to replace all the work a data scientist has to do before getting his model into production. ATOM doesn't spit out production-ready models just by tuning some parameters in its API. After helping you determine the right pipeline, you will most probably need to fine-tune it using use-case specific features and data cleaning steps in order to achieve maximum performance.

                                                                                                                                                                                  Example steps taken by ATOM's pipeline:

                                                                                                                                                                                  1. Data Cleaning
                                                                                                                                                                                    • Handle missing values
                                                                                                                                                                                    • Encode categorical features
                                                                                                                                                                                    • Detect and remove outliers
                                                                                                                                                                                    • Balance the training set
                                                                                                                                                                                  2. Feature engineering
                                                                                                                                                                                    • Create new non-linear features
                                                                                                                                                                                    • Select the most promising features
                                                                                                                                                                                  3. Train and validate multiple models
                                                                                                                                                                                    • Apply hyperparameter tuning
                                                                                                                                                                                    • Fit the models on the training set
                                                                                                                                                                                    • Evaluate the results on the test set
                                                                                                                                                                                  4. Analyze the results
                                                                                                                                                                                    • Get the scores on various metrics
                                                                                                                                                                                    • Make plots to compare the model performances

                                                                                                                                                                                  Figure 1. Diagram of a possible pipeline created by ATOM."}, {"location": "user_guide/logging/", "title": "Logging & Tracking", "text": ""}, {"location": "user_guide/logging/#logging", "title": "Logging", "text": "

                                                                                                                                                                                  To start logging your experiments, fill the logger parameter with the name or path to store the logging file. If automatic naming is used, the file is saved using the __name__ of the class followed by the timestamp of the logger's creation, e.g. ATOMClassifier_11May21_20h11m03s. The logging file contains method calls, all printed messages to stdout with maximum verbosity, and any exception raised during running. Additionally, the logging entries of external libraries are redirected to the same file handler.

                                                                                                                                                                                  "}, {"location": "user_guide/logging/#tracking", "title": "Tracking", "text": "

                                                                                                                                                                                          ATOM uses MLflow Tracking as a backend API and UI for logging models, parameters, pipelines, data and plots. Start tracking your experiments by assigning a name to the experiment parameter. Every model is tracked using a separate run. When no backend is configured, the data is stored locally at ./mlruns. To configure the backend, use mlflow.set_tracking_uri in your notebook or IDE before initializing atom. This does not affect the currently active run (if one exists), but takes effect for successive runs. Run mlflow ui on your terminal to open MLflow's Tracking UI and view it at http://localhost:5000.

                                                                                                                                                                                  Note

                                                                                                                                                                                  When using ATOM on Databricks, the experiment's name should include the complete path to the storage, e.g., /Users/username@domain.com/experiment_name.

                                                                                                                                                                                  Example

                                                                                                                                                                                  from atom import ATOMClassifier\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\natom = ATOMClassifier(X, y, experiment=\"breast_cancer\")\natom.run(models=[\"LR\", \"RF\", \"LGB\"], n_trials=(0, 0, 10))\n

                                                                                                                                                                                  "}, {"location": "user_guide/logging/#dagshub-integration", "title": "DAGsHub integration", "text": "

                                                                                                                                                                                          ATOM has a built-in integration with DAGsHub, a web platform based on open source tools, optimized for data science and oriented towards the open source community. To store your mlflow experiments in a DAGsHub repo, type dagshub:<experiment_name> in the experiment parameter (instead of just the experiment's name). If the repo does not already exist, a new public repo is created.

                                                                                                                                                                                  Info

                                                                                                                                                                                  If you are logged into your DAGsHub account when initializing atom with a dagshub experiment, a page on your web browser is automatically opened to give access permissions. If not, read here how to set up your DAGsHub credentials.

                                                                                                                                                                                  Example

                                                                                                                                                                                  from atom import ATOMClassifier\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\natom = ATOMClassifier(X, y, experiment=\"dagshub:breast_cancer\")\natom.run(models=[\"LR\", \"RF\"])\n

                                                                                                                                                                                  "}, {"location": "user_guide/logging/#tracked-elements", "title": "Tracked elements", "text": "

                                                                                                                                                                                  Tags The runs are automatically tagged with the model's full name, the branch from which the model was trained, and the time it took to fit the model. Add additional custom tags through the ht_params parameter, e.g., atom.run([\"LR\", \"RF\"], ht_params={\"tags\": {\"tag1\": 1}}).

                                                                                                                                                                                  Parameters All parameters used by the estimator at initialization are tracked. Additional parameters passed to the fit method are not tracked.

                                                                                                                                                                                  Model The model's estimator is stored as artifact. The estimator has to be compatible with the mlflow.sklearn, module.

                                                                                                                                                                                  Hyperparameter tuning If hyperparameter tuning is performed, every trial is tracked as a nested run in the model's main run. This option can be switched off using atom's log_ht attribute, e.g., atom.log_ht = False. The data and pipeline options are never stored within nested runs.

                                                                                                                                                                                  Metrics All metric results are tracked, not only during training, but also when the evaluate method is called at a later point. Metrics calculated during in-training validation are also stored.

                                                                                                                                                                                  Dataset The train and test sets used to fit and evaluate the model can be stored as .csv files to the run's artifacts. This option can be switched on using atom's log_data attribute, e.g. atom.log_data = True.

                                                                                                                                                                                  Pipeline The model's pipeline (returned from the export_pipeline method) can be stored as an artifact. This option can be switched on using atom's log_pipeline attribute, e.g., atom.log_pipeline = True.

                                                                                                                                                                                          Plots By default, plots are stored as .html artifacts in all runs corresponding to the models that are shown in the plot. If the filename parameter is specified, they are stored under that name, else the method's name is used. This option can be switched off using atom's log_plots attribute, e.g., atom.log_plots = False.

                                                                                                                                                                                  "}, {"location": "user_guide/models/", "title": "Models", "text": ""}, {"location": "user_guide/models/#predefined-models", "title": "Predefined models", "text": "

                                                                                                                                                                                  ATOM provides many models for classification and regression tasks that can be used to fit the data in the pipeline. After fitting, a class containing the underlying estimator is attached to atom as an attribute. We refer to these \"subclasses\" as models. Apart from the estimator, the models contain a variety of attributes and methods that can help you understand how the underlying estimator performed. They can be accessed using their acronyms, e.g., atom.LGB to access the LightGBM model. The available models and their corresponding acronyms are:

                                                                                                                                                                                  • AdaBoost (AdaB)
                                                                                                                                                                                  • ARIMA (Arima)
                                                                                                                                                                                  • AutoARIMA (AutoARIMA)
                                                                                                                                                                                  • AutomaticRelevanceDetermination (ARD)
                                                                                                                                                                                  • Bagging (Bag)
                                                                                                                                                                                  • BayesianRidge (BR)
                                                                                                                                                                                  • BernoulliNB (BNB)
                                                                                                                                                                                  • CatBoost (CatB)
                                                                                                                                                                                  • CategoricalNB (CatNB)
                                                                                                                                                                                  • ComplementNB (CNB)
                                                                                                                                                                                  • DecisionTree (Tree)
                                                                                                                                                                                  • Dummy (Dummy)
                                                                                                                                                                                  • ElasticNet (EN)
                                                                                                                                                                                  • ETS (ETS)
                                                                                                                                                                                  • ExponentialSmoothing (ES)
                                                                                                                                                                                  • ExtraTree (ETree)
                                                                                                                                                                                  • ExtraTrees (ET)
                                                                                                                                                                                  • GaussianNB (GNB)
                                                                                                                                                                                  • GaussianProcess (GP)
                                                                                                                                                                                  • GradientBoostingMachine (GBM)
                                                                                                                                                                                  • HuberRegression (Huber)
                                                                                                                                                                                  • HistGradientBoosting (hGBM)
                                                                                                                                                                                  • KNearestNeighbors (KNN)
                                                                                                                                                                                  • Lasso (Lasso)
                                                                                                                                                                                  • LeastAngleRegression (Lars)
                                                                                                                                                                                  • LightGBM (LGB)
                                                                                                                                                                                  • LinearDiscriminantAnalysis (LDA)
                                                                                                                                                                                  • LinearSVM (lSVM)
                                                                                                                                                                                  • LogisticRegression (LR)
                                                                                                                                                                                  • MultiLayerPerceptron (MLP)
                                                                                                                                                                                  • MultinomialNB (MNB)
                                                                                                                                                                                  • NaiveForecaster (NF)
                                                                                                                                                                                  • OrdinaryLeastSquares (OLS)
                                                                                                                                                                                  • OrthogonalMatchingPursuit (OMP)
                                                                                                                                                                                  • PassiveAggressive (PA)
                                                                                                                                                                                  • Perceptron (Perc)
                                                                                                                                                                                  • PolynomialTrend (PT)
                                                                                                                                                                                  • QuadraticDiscriminantAnalysis (QDA)
                                                                                                                                                                                  • RadiusNearestNeighbors (RNN)
                                                                                                                                                                                  • RandomForest (RF)
                                                                                                                                                                                  • Ridge (Ridge)
                                                                                                                                                                                  • StochasticGradientDescent (SGD)
                                                                                                                                                                                  • SupportVectorMachine (SVM)
                                                                                                                                                                                  • XGBoost (XGB)

                                                                                                                                                                                  Warning

                                                                                                                                                                                          The model classes cannot be initialized directly by the user! Use them only through atom.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  The acronyms are case-insensitive, e.g., atom.lgb also calls the LightGBM model.

                                                                                                                                                                                  "}, {"location": "user_guide/models/#custom-models", "title": "Custom models", "text": "

                                                                                                                                                                                  It is also possible to create your own models in ATOM's pipeline. For example, imagine we want to use sklearn's RANSACRegressor estimator (note that is not included in ATOM's predefined models). There are two ways to achieve this:

                                                                                                                                                                                  • Using ATOMModel (recommended). With this approach you can pass the required model characteristics to the pipeline.
                                                                                                                                                                                  >>> from atom import ATOMRegressor, ATOMModel\n>>> from sklearn.datasets import load_diabetes\n>>> from sklearn.linear_model import RANSACRegressor\n\n>>> ransac = ATOMModel(RANSACRegressor, name=\"RANSAC\", needs_scaling=True)\n\n>>> X, y = load_diabetes(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMRegressor(X, y)\n>>> atom.run(ransac)\n
                                                                                                                                                                                  • Using the estimator's class or an instance of the class. This approach will also call ATOMModel under the hood, but it will leave its parameters to their default values.
                                                                                                                                                                                  >>> from atom import ATOMRegressor\n>>> from sklearn.datasets import load_diabetes\n>>> from sklearn.linear_model import RANSACRegressor\n\n>>> X, y = load_diabetes(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMRegressor(X, y)\n>>> atom.run(RANSACRegressor)\n

                                                                                                                                                                                  Additional things to take into account:

                                                                                                                                                                                  • Custom models can be accessed through their acronym like any other model, e.g. atom.ransac in the example above.
                                                                                                                                                                                  • Custom models are not restricted to sklearn estimators, but they should follow sklearn's API, i.e., have a fit and predict method.
                                                                                                                                                                                  • Parameter customization (for the initializer) is only possible for custom models which provide an estimator that has a set_params() method, i.e., it's a child class of BaseEstimator.
                                                                                                                                                                                  • Hyperparameter tuning for custom models is ignored unless appropriate dimensions are provided through ht_params.

                                                                                                                                                                                  "}, {"location": "user_guide/models/#deep-learning", "title": "Deep learning", "text": "

                                                                                                                                                                                  Deep learning models can be used through ATOM's custom models as long as they follow sklearn's API. For example, models implemented with the Keras package should use the scikeras wrappers KerasClassifier or KerasRegressor.

                                                                                                                                                                                  Many deep learning use cases, for example in computer vision, use datasets with more than 2 dimensions, e.g., image data can have shape (n_samples, length, width, rgb). Luckily, scikeras has a workaround to be able to work with such datasets. Learn with this example how to use ATOM to train and validate a Convolutional Neural Network on an image dataset.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Models implemented with keras can only use custom hyperparameter tuning when n_jobs=1 or ht_params={\"cv\": 1}. Using n_jobs > 1 and cv > 1 raises a PicklingError due to incompatibilities of the APIs.

                                                                                                                                                                                  "}, {"location": "user_guide/models/#ensembles", "title": "Ensembles", "text": "

                                                                                                                                                                                  Ensemble models use multiple estimators to obtain better predictive performance than could be obtained from any of the constituent learning algorithms alone. ATOM implements two ensemble techniques: voting and stacking. Click here to see an example that uses ensemble models.

                                                                                                                                                                                  If the ensemble's underlying estimator is a model that used automated feature scaling, it's added as a Pipeline containing the scaler and estimator. If a mlflow experiment is active, the ensembles start their own run, just like the predefined models do.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                  "}, {"location": "user_guide/models/#voting", "title": "Voting", "text": "

                                                                                                                                                                                  The idea behind voting is to combine the predictions of conceptually different models to make new predictions. Such a technique can be useful for a set of equally well performing models in order to balance out their individual weaknesses. Read more in sklearn's documentation.

                                                                                                                                                                                  A voting model is created from a trainer through the voting method. The voting model is added automatically to the list of models in the trainer, under the Vote acronym. The underlying estimator is a custom adaptation of VotingClassifier or VotingRegressor depending on the task. The differences between ATOM's and sklearn's implementation are:

                                                                                                                                                                                  • ATOM's implementation doesn't fit estimators if they're already fitted.
                                                                                                                                                                                  • ATOM's instance is considered fitted at initialization when all underlying estimators are.
                                                                                                                                                                                  • ATOM's VotingClassifier doesn't implement a LabelEncoder to encode the target column.

                                                                                                                                                                                          The two estimators are customized in this way to save time and computational resources, since the classes are always initialized with fitted estimators. As a consequence of this, the VotingClassifier cannot use sklearn's built-in LabelEncoder for the target column since it can't be fitted when initializing the class. For the vast majority of use cases, the changes will have no effect. If you want to export the estimator and retrain it on different data, just make sure to clone the underlying estimators first.

                                                                                                                                                                                  "}, {"location": "user_guide/models/#stacking", "title": "Stacking", "text": "

                                                                                                                                                                                  Stacking is a method for combining estimators to reduce their biases. More precisely, the predictions of each individual estimator are stacked together and used as input to a final estimator to compute the prediction. Read more in sklearn's documentation.

                                                                                                                                                                                  A stacking model is created from a trainer through the stacking method. The stacking model is added automatically to the list of models in the trainer, under the Stack acronym. The underlying estimator is a custom adaptation of StackingClassifier or StackingRegressor depending on the task. The only difference between ATOM's and sklearn's implementation is that ATOM's implementation doesn't fit estimators if they're already fitted. The two estimators are customized in this way to save time and computational resources, since the classes are always initialized with fitted estimators. For the vast majority of use cases, the changes will have no effect. If you want to export the estimator and retrain it on different data, just make sure to clone the underlying estimators first.

                                                                                                                                                                                  "}, {"location": "user_guide/nlp/", "title": "Natural Language Processing", "text": "

                                                                                                                                                                                  Natural Language Processing (NLP) is the subfield of machine learning that works with human language data. The nlp module contains four classes that help to convert raw text to meaningful numeric values, ready to be ingested by a model. ATOM uses the nltk library for the majority of its NLP processes.

                                                                                                                                                                                  The text documents are expected to be provided in a column of the dataframe named corpus (the name is case-insensitive). Only the corpus is changed by the transformers, leaving the rest of the columns as is. This mechanism allows atom to combine datasets containing a text corpus with other non-text features. If an array is provided as input, it should consist of only one feature containing the text (one document per row). ATOM will then automatically convert the array to a dataframe with the desired column name. Documents are expected to be strings or sequences of words. Click here for an example using text data.

                                                                                                                                                                                  Note

                                                                                                                                                                                  All of atom's NLP methods automatically adopt the relevant transformer attributes (verbose, logger) from atom. A different choice can be added as parameter to the method call, e.g., atom.tokenize(verbose=0).

                                                                                                                                                                                  Info

                                                                                                                                                                                  ATOM doesn't do topic modeling! The module's goal is to help process text documents into features that can be used for supervised learning.

                                                                                                                                                                                  "}, {"location": "user_guide/nlp/#text-cleaning", "title": "Text cleaning", "text": "

                                                                                                                                                                                          Text data is rarely clean. Whether it's scraped from a website or inferred from paper documents, it's always populated with irrelevant information for the model, such as email addresses, HTML tags, numbers or punctuation marks. Use the TextCleaner class to remove such noise from the corpus. It can be accessed from atom through the textclean method. Use the class' parameters to choose which transformations to perform. The available steps are:

                                                                                                                                                                                  • Decode unicode characters to their ascii representations.
                                                                                                                                                                                  • Convert all characters to lower case.
                                                                                                                                                                                  • Drop email addresses from the text.
                                                                                                                                                                                  • Drop URL links from the text.
                                                                                                                                                                                  • Drop HTML tags from the text.
                                                                                                                                                                                  • Drop emojis from the text.
                                                                                                                                                                                  • Drop numbers from the text.
                                                                                                                                                                                          • Drop punctuation marks from the text.

                                                                                                                                                                                  "}, {"location": "user_guide/nlp/#tokenization", "title": "Tokenization", "text": "

                                                                                                                                                                                          Some text processing algorithms, like stemming or lemmatization, require the corpus to be made out of tokens, instead of strings, in order to know what to consider as words. Tokenization is used to achieve this. It separates every document into a sequence of smaller units, in this case, the words.

                                                                                                                                                                                  Sometimes, words have a different meaning on their own than when combined with adjacent words. For example, the word new has a completely different meaning when the word york is directly after it than when it's not. These combinations of two words are called bigrams. When there are three words, they are called trigrams, and with four words quadgrams.

                                                                                                                                                                                  The Tokenizer class converts a document into a sequence of words, and can create the most frequent bigrams, trigrams and quadgrams. It can be accessed from atom through the tokenize method.

                                                                                                                                                                                  "}, {"location": "user_guide/nlp/#text-normalization", "title": "Text Normalization", "text": "

                                                                                                                                                                                  Normalization for texts is a process that converts a list of words to a more uniform standard. This is useful to reduce the amount of different information that the computer has to deal with, and therefore improves efficiency. The goal of normalization techniques like stemming and lemmatization is to reduce inflectional and related forms of a word to a common base form.

                                                                                                                                                                                  Normalize the words in the corpus using the TextNormalizer class. It can be accessed from atom through the textnormalize method.

                                                                                                                                                                                  "}, {"location": "user_guide/nlp/#vectorization", "title": "Vectorization", "text": "

                                                                                                                                                                                  Text data cannot be fed directly to the algorithms themselves, as most of them expect numerical feature vectors with a fixed size, rather than words in the text documents with variable length. Vectorization is the general process of turning a collection of text documents into numerical feature vectors. You can apply it to the corpus using the Vectorizer class. It can be accessed from atom through the vectorize method.

                                                                                                                                                                                  Info

                                                                                                                                                                                  All strategies can utilize GPU speed-up. Click here for further information about GPU acceleration.

                                                                                                                                                                                  Bag of Words The Bag of Words (BOW) strategy applies tokenization, counting and normalization to the corpus. Documents are described by word occurrences while completely ignoring the relative position information of the words in the document. The created columns are named with the words they are embedding with the prefix corpus_. Read more in sklearn's documentation.

                                                                                                                                                                                  TF-IDF In a large text corpus, some words will be very present (e.g., \u201cthe\u201d, \u201ca\u201d, \u201cis\u201d in English), hence carrying very little meaningful information about the actual contents of the document. If we were to feed the direct count data directly to a classifier, those very frequent terms would shadow the frequencies of rarer, yet more interesting, terms. Use the TF-IDF strategy to re-weight the count features into floating point values. The created columns are named with the words they are embedding with the prefix corpus_. Read more in sklearn's documentation.

                                                                                                                                                                                          Hashing The larger the corpus, the larger the vocabulary will grow, thus increasing the number of features and the memory use. Use the Hashing strategy to hash the words to a specified number of features. The created features are named hash0, hash1, etc. Read more in sklearn's documentation.

                                                                                                                                                                                  "}, {"location": "user_guide/nomenclature/", "title": "Nomenclature", "text": "

                                                                                                                                                                                  This documentation consistently uses terms to refer to certain concepts related to this package. The most frequent terms are described hereunder.

                                                                                                                                                                                  ATOM

                                                                                                                                                                                  Refers to this package.

                                                                                                                                                                                  atom

                                                                                                                                                                                  Instance of the ATOMClassifier, ATOMForecaster or ATOMRegressor classes (note that the examples use it as the default variable name).

                                                                                                                                                                                          branch

                                                                                                                                                                                          A pipeline, corresponding dataset and models fitted to that dataset. See the branches section of the user guide.

                                                                                                                                                                                  categorical columns

                                                                                                                                                                                  Refers to all columns of type object or category.

                                                                                                                                                                                  class

                                                                                                                                                                                  Unique value in a column, e.g., a binary classifier has 2 classes in the target column.

                                                                                                                                                                                  dataframe

                                                                                                                                                                                  Two-dimensional, size-mutable, potentially heterogeneous tabular data of type pd.DataFrame or its modin counterpart.

                                                                                                                                                                                  dataframe-like

                                                                                                                                                                                  Any type object from which a dataframe can be created. This includes an iterable, a dict whose values are 1d-arrays, a two-dimensional list, tuple, np.ndarray or sps.csr_matrix, and most commonly, a dataframe. This is the standard input format for any dataset.

                                                                                                                                                                                  Additionally, you can provide a callable whose output is any of the aforementioned types. This is useful when the dataset is very large and you are performing parallel operations, since it can avoid broadcasting a large dataset from the driver to the workers.

                                                                                                                                                                                  estimator

                                                                                                                                                                                  An object which manages the estimation and decoding of an algorithm. The algorithm is estimated as a deterministic function of a set of parameters, a dataset and a random state. Should implement a fit method. Often used interchangeably with predictor because of user preference.

                                                                                                                                                                                  index

                                                                                                                                                                                          Immutable sequence used for indexing and alignment of type pd.Index or its modin counterpart.

                                                                                                                                                                                  missing values

                                                                                                                                                                                  All values in the missing attribute, as well as None, NaN, +inf and -inf.

                                                                                                                                                                                  model

                                                                                                                                                                                          Instance of a model in atom. Not to be confused with estimator.

                                                                                                                                                                                  outliers

                                                                                                                                                                                  Sample that contains one or more outlier values. Note that the Pruner class can use a different definition for outliers depending on the chosen strategy.

                                                                                                                                                                                  outlier value

                                                                                                                                                                                  Value that lies further than 3 times the standard deviation away from the mean of its column, i.e., |z-score| > 3.

                                                                                                                                                                                  predictor

                                                                                                                                                                                  An estimator implementing a predict method.

                                                                                                                                                                                  scorer

                                                                                                                                                                                  A non-estimator callable object which evaluates an estimator on given test data, returning a number. Unlike evaluation metrics, a greater returned number must correspond with a better score. See sklearn's documentation.

                                                                                                                                                                                  segment

                                                                                                                                                                                  Subset (segment) of a sequence, whether through slicing or generating a range of values. When given as a parameter type, it includes both range and slice.

                                                                                                                                                                                  sequence

                                                                                                                                                                                  A one-dimensional, indexable array of type sequence (except string), np.ndarray, index or series. This is the standard input format for a dataset's target column.

                                                                                                                                                                                  series

                                                                                                                                                                                  One-dimensional ndarray with axis labels of type pd.Series or its modin counterpart.

                                                                                                                                                                                  target

                                                                                                                                                                                  The dependent variable in a supervised learning task. Passed as y to an estimator's fit method.

                                                                                                                                                                                  task

                                                                                                                                                                                  One of the supervised machine learning approaches that ATOM supports:

                                                                                                                                                                                  • binary classification
                                                                                                                                                                                  • multiclass classification
                                                                                                                                                                                  • multilabel classification
                                                                                                                                                                                  • multiclass-multioutput classification
                                                                                                                                                                                  • regression
                                                                                                                                                                                  • multioutput regression
                                                                                                                                                                                  • univariate forecast
                                                                                                                                                                                  • multivariate forecast
                                                                                                                                                                                  transformer

                                                                                                                                                                                  An estimator implementing a transform method. This encompasses all data cleaning and feature engineering classes.

                                                                                                                                                                                  "}, {"location": "user_guide/plots/", "title": "Plots", "text": "

                                                                                                                                                                                  ATOM provides many plotting methods to analyze the data or compare the model performances. Descriptions and examples can be found in the API section. ATOM mainly uses the plotly library for plotting. Plotly makes interactive, publication-quality graphs that are rendered using html. Some plots require other libraries like matplotlib, shap, wordcloud and schemdraw.

                                                                                                                                                                                  Plots that compare model performances (methods with the models parameter) can be called directly from atom, e.g., atom.plot_roc(), or from one of the models, e.g., atom.adab.plot_roc(). If called from atom, use the models parameter to specify which models to plot. If called from a specific model, it makes the plot only for that model and the models parameter becomes unavailable.

                                                                                                                                                                                  Plots that analyze the data (methods without the models parameter) can only be called from atom, and not from the models.

                                                                                                                                                                                  "}, {"location": "user_guide/plots/#parameters", "title": "Parameters", "text": "

                                                                                                                                                                                  Apart from the plot-specific parameters, all plots have five parameters in common:

                                                                                                                                                                                  • The title parameter adds a title to the plot. The default value doesn't show any title. Provide a configuration (as dictionary) to customize its appearance, e.g., title=dict(text=\"Awesome plot\", color=\"red\"). Read more in plotly's documentation.
                                                                                                                                                                                  • The legend parameter is used to show/hide, position or customize the plot's legend. Provide a configuration (as dictionary) to customize its appearance (e.g., legend=dict(title=\"Title for legend\", title_font_color=\"red\")) or choose one of the following locations:

                                                                                                                                                                                    • upper left
                                                                                                                                                                                    • upper right
                                                                                                                                                                                    • lower left
                                                                                                                                                                                    • lower right
                                                                                                                                                                                    • upper center
                                                                                                                                                                                    • lower center
                                                                                                                                                                                    • center left
                                                                                                                                                                                    • center right
                                                                                                                                                                                    • center
                                                                                                                                                                                    • out: Position the legend outside the axis, on the right hand side. This is plotly's default position. Note that this shrinks the size of the axis to fit both legend and axes in the specified figsize.
                                                                                                                                                                                  • The figsize parameter adjusts the plot's size.

                                                                                                                                                                                  • The filename parameter is used to save the plot.
                                                                                                                                                                                  • The display parameter determines whether to show or return the plot.

                                                                                                                                                                                  Info

                                                                                                                                                                                  In some plotting methods, it's possible to plot separate lines for different subsets of the rows. For example, to compare the results on the train and test set. For these cases, either provide a sequence to the rows parameter for every line you want to draw, e.g., atom.plot_roc(rows=(\"train\", \"test\")), or provide a dictionary where the keys are the names of the sets (used in the legend) and the values are the corresponding selection of rows, selected using any of the aforementioned approaches, e.g., atom.plot_roc(rows={\"0-99\": range(100), \"100-199\": range(100, 200)}). Note that for these methods, using atom.plot_roc(rows=\"train+test\") only plots one line with the data from both sets. See the advanced plotting example.

                                                                                                                                                                                  "}, {"location": "user_guide/plots/#aesthetics", "title": "Aesthetics", "text": "

                                                                                                                                                                                  The plot's aesthetics can be customized using the plot attributes prior to calling the plotting method, e.g., atom.title_fontsize = 30. The default values are:

                                                                                                                                                                                  • palette: [\"rgb(0, 98, 98)\", \"rgb(56, 166, 165)\", \"rgb(115, 175, 72)\", \"rgb(237, 173, 8)\", \"rgb(225, 124, 5)\", \"rgb(204, 80, 62)\", \"rgb(148, 52, 110)\", \"rgb(111, 64, 112)\", \"rgb(102, 102, 102)\"]
                                                                                                                                                                                  • title_fontsize: 24
                                                                                                                                                                                  • label_fontsize: 16
                                                                                                                                                                                  • tick_fontsize: 12

                                                                                                                                                                                  Use atom's update_layout method to further customize the plot's layout using any of plotly's layout properties, e.g., atom.update_layout(template=\"plotly_dark\"). Similarly, use the update_traces method to customize the trace properties, e.g., atom.update_traces(mode=\"lines+markers\").

                                                                                                                                                                                  The reset_aesthetics method allows you to reset all aesthetics to their default value. See advanced plotting for an example.

                                                                                                                                                                                  "}, {"location": "user_guide/plots/#canvas", "title": "Canvas", "text": "

                                                                                                                                                                                  Use the canvas method to draw multiple plots side by side, for example to make it easier to compare similar results. The canvas method is a @contextmanager, i.e., it's used through Python's with command. Plots in a canvas ignore the legend, figsize, filename and display parameters. Instead, specify these parameters in the canvas. If a variable is assigned to the canvas (e.g., with atom.canvas() as fig), it yields the resulting figure.

                                                                                                                                                                                  For example, we can use a canvas to compare the results of a XGBoost and LightGBM model on the train and test set. We could also draw the lines for both models in the same axes, but that would clutter the plot too much. Click here for more examples.

                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y)\n>>> atom.run([\"XGB\", \"LGB\"])\n\n>>> with atom.canvas(2, 2, title=\"XGBoost vs LightGBM\"):\n...     atom.xgb.plot_roc(rows=\"train+test\", title=\"ROC - XGBoost\")\n...     atom.lgb.plot_roc(rows=\"train+test\", title=\"ROC - LightGBM\")\n...     atom.xgb.plot_prc(rows=\"train+test\", title=\"PRC - XGBoost\")\n...     atom.lgb.plot_prc(rows=\"train+test\", title=\"PRC - LightGBM\")\n

                                                                                                                                                                                  "}, {"location": "user_guide/plots/#shap", "title": "SHAP", "text": "

                                                                                                                                                                                  The SHAP (SHapley Additive exPlanations) python package uses a game theoretic approach to explain the output of any machine learning model. It connects optimal credit allocation with local explanations using the classic Shapley values from game theory and their related extensions. ATOM implements methods to plot 7 of SHAP's plotting functions directly from its API. A list of available shap plots can be found here.

                                                                                                                                                                                  Calculating the Shapley values is computationally expensive, especially for model agnostic explainers like Permutation. To avoid having to recalculate the values for every plot, ATOM stores the Shapley values internally after the first calculation, and accesses them later when needed again.

                                                                                                                                                                                  Note

                                                                                                                                                                                  Since the plot figures are not made by ATOM, note the following:

                                                                                                                                                                                  • It's not possible to draw multiple models in the same figure. Selecting more than one model will raise an exception. To avoid this, call the plot directly from a model, e.g., atom.lr.plot_shap_force().
                                                                                                                                                                                  • The returned plot is a matplotlib figure, not plotly's.

                                                                                                                                                                                  "}, {"location": "user_guide/plots/#available-plots", "title": "Available plots", "text": "

                                                                                                                                                                                  A list of available plots can be found hereunder. Note that not all plots can be called from every class and that their availability can depend on the task at hand.

                                                                                                                                                                                  "}, {"location": "user_guide/plots/#data-plots", "title": "Data plots", "text": "

                                                                                                                                                                                  plot_componentsPlot the explained variance ratio per component.plot_correlationPlot a correlation matrix.plot_distributionPlot column distributions.plot_ngramsPlot n-gram frequencies.plot_pcaPlot the explained variance ratio vs number of components.plot_qqPlot a quantile-quantile plot.plot_relationshipsPlot pairwise relationships in a dataset.plot_rfecvPlot the rfecv results.plot_wordcloudPlot a wordcloud from the corpus.

                                                                                                                                                                                  "}, {"location": "user_guide/plots/#hyperparameter-tuning-plots", "title": "Hyperparameter tuning plots", "text": "

                                                                                                                                                                                  plot_edfPlot the Empirical Distribution Function of a study.plot_hyperparameter_importancePlot a model's hyperparameter importance.plot_hyperparametersPlot hyperparameter relationships in a study.plot_parallel_coordinatePlot high-dimensional parameter relationships in a study.plot_pareto_frontPlot the Pareto front of a study.plot_slicePlot the parameter relationship in a study.plot_terminator_improvementPlot the potentials for future objective improvement.plot_timelinePlot the timeline of a study.plot_trialsPlot the hyperparameter tuning trials.

                                                                                                                                                                                  "}, {"location": "user_guide/plots/#prediction-plots", "title": "Prediction plots", "text": "

                                                                                                                                                                                  plot_calibrationPlot the calibration curve for a binary classifier.plot_confusion_matrixPlot a model's confusion matrix.plot_detPlot the Detection Error Tradeoff curve.plot_errorsPlot a model's prediction errors.plot_evalsPlot evaluation curves.plot_feature_importancePlot a model's feature importance.plot_forecastPlot a time series with model forecasts.plot_gainsPlot the cumulative gains curve.plot_learning_curvePlot the learning curve: score vs number of training samples.plot_liftPlot the lift curve.plot_parshapPlot the partial correlation of shap values.plot_partial_dependencePlot the partial dependence of features.plot_permutation_importancePlot the feature permutation importance of models.plot_pipelinePlot a diagram of the pipeline.plot_prcPlot the precision-recall curve.plot_probabilitiesPlot the probability distribution of the target classes.plot_residualsPlot a model's residuals.plot_resultsPlot the model results.plot_rocPlot the Receiver Operating Characteristics curve.plot_successive_halvingPlot scores per iteration of the successive halving.plot_thresholdPlot metric performances against threshold values.

                                                                                                                                                                                  "}, {"location": "user_guide/plots/#shap-plots", "title": "Shap plots", "text": "

                                                                                                                                                                                  plot_shap_barPlot SHAP's bar plot.plot_shap_beeswarmPlot SHAP's beeswarm plot.plot_shap_decisionPlot SHAP's decision plot.plot_shap_forcePlot SHAP's force plot.plot_shap_heatmapPlot SHAP's heatmap plot.plot_shap_scatterPlot SHAP's scatter plot.plot_shap_waterfallPlot SHAP's waterfall plot.

                                                                                                                                                                                  "}, {"location": "user_guide/predicting/", "title": "Predicting", "text": "

                                                                                                                                                                                  After training a model, you probably want to make predictions on new, unseen data. Just like a sklearn estimator, you can call the prediction methods from the model, e.g., atom.tree.predict(X).

                                                                                                                                                                                  All prediction methods transform the provided data through the pipeline in the model's branch before making the predictions. Transformers that should only be applied on the training set are excluded from this step (e.g., outlier pruning or class balancing).

                                                                                                                                                                                  The available prediction methods are the standard methods for estimators in sklearn's and sktime's API.

                                                                                                                                                                                  For classification and regression tasks:

                                                                                                                                                                                  decision_functionGet confidence scores on new data or existing rows.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.scoreGet a metric score on new data.

                                                                                                                                                                                  For forecast tasks:

                                                                                                                                                                                  predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.scoreGet a metric score on new data.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  The score method return atom's metric score, not the metric returned by sklearn/sktime's score method for estimators. Use the method's metric parameter to calculate a different metric.

                                                                                                                                                                                  Note

                                                                                                                                                                                  • The outputs of ATOM's methods are pandas objects, not numpy arrays.
                                                                                                                                                                                  • The predict_proba method of some meta-estimators for multioutput tasks (such as MultioutputClassifier) returns 3 dimensions, namely, a list of arrays with shape=(n_samples, n_classes). One array per target column. Since ATOM's prediction methods return pandas objects, such 3-dimensional arrays are converted to a multiindex pd.DataFrame, where the first level of the row indices are the target columns, and the second level are the classes.
                                                                                                                                                                                  • The prediction results are cached after the first call to avoid consequent expensive calculations. This mechanism can increase the size of the instance for large datasets. Use the clear method to free the memory.

                                                                                                                                                                                  It's also possible to get the prediction for a specific row or rows in the dataset. See the row and column selection section in the user guide to learn how to select the rows, e.g., atom.rf.predict(\"test\") or atom.rf.predict_proba(range(100)).

                                                                                                                                                                                  Note

                                                                                                                                                                                  For forecast models, prediction on rows follow the ForecastingHorizon API. That means that using the row index works, but for example using atom.arima.predict(1) returns the prediction on the first row of the test set (instead of the second row of the train set).

                                                                                                                                                                                  "}, {"location": "user_guide/time_series/", "title": "Time series", "text": ""}, {"location": "user_guide/time_series/#forecast", "title": "Forecast", "text": ""}, {"location": "user_guide/time_series/#time-series-classification", "title": "Time series classification", "text": ""}, {"location": "user_guide/time_series/#time-series-regression", "title": "Time series regression", "text": ""}, {"location": "user_guide/training/", "title": "Training", "text": "

                                                                                                                                                                                  The training phase is where the models are fitted on the training data. After this, you can use the plots and prediction methods to evaluate the results. The training applies the following steps for all models:

                                                                                                                                                                                  1. Use hyperparameter tuning to select the optimal hyperparameters for the model (optional).
                                                                                                                                                                                  2. The model is fitted on the training set using the best combination of hyperparameters found. After that, the model is evaluated on the tes set.
                                                                                                                                                                                  3. Calculate various scores on the test set using a bootstrap algorithm (optional).

                                                                                                                                                                                  There are three approaches to run the training.

                                                                                                                                                                                  • Direct training:
                                                                                                                                                                                    • DirectClassifier
                                                                                                                                                                                    • DirectForecaster
                                                                                                                                                                                    • DirectRegressor
                                                                                                                                                                                  • Training via successive halving:
                                                                                                                                                                                    • SuccessiveHalvingClassifier
                                                                                                                                                                                    • SuccessiveHalvingForecaster
                                                                                                                                                                                    • SuccessiveHalvingRegressor
                                                                                                                                                                                  • Training via train sizing:
                                                                                                                                                                                    • TrainSizingClassifier
                                                                                                                                                                                    • TrainSizingForecaster
                                                                                                                                                                                    • TrainSizingRegressor

                                                                                                                                                                                  The direct fashion repeats the aforementioned steps only once, while the other two approaches repeat them more than once. Just like the data cleaning and feature engineering classes, it's discouraged to use these classes directly. Instead, every approach can be called directly from atom through the run, successive_halving and train_sizing methods respectively.

                                                                                                                                                                                  Models are called through their acronyms, e.g., atom.run(models=\"RF\") will train a RandomForest. If you want to run the same model multiple times, add a tag after the acronym to differentiate them. the tag must be separated from the accronym by an underscore.

                                                                                                                                                                                  atom.run(\n    models=[\"RF_1\", \"RF_2\"],\n    est_params={\n        \"RF_1\": {\"n_estimators\": 100},\n        \"RF_2\": {\"n_estimators\": 200},\n    }\n)\n

                                                                                                                                                                                  For example, this pipeline fits two Random Forest models, one with 100 and the other with 200 decision trees. The models can be accessed through atom.rf_1 and atom.rf_2. Use tagged models to test how the same model performs when fitted with different parameters or on different data sets. See the Imbalanced datasets example.

                                                                                                                                                                                  Additional things to take into account:

                                                                                                                                                                                  • If an exception is encountered while fitting an estimator, the pipeline will automatically jump to the next model. The exceptions are stored in the errors attribute. Note that when a model is skipped, there is no model subclass for that estimator.
                                                                                                                                                                                  • When showing the final results, a ! indicates the highest score and a ~ indicates that the model is possibly overfitting (training set has a score at least 20% higher than the test set).

                                                                                                                                                                                  "}, {"location": "user_guide/training/#metric", "title": "Metric", "text": "

                                                                                                                                                                                  ATOM uses sklearn's scorers for model evaluation. A scorer consists of a metric function and some parameters that define the scorer's properties, such as if a higher or lower score is better (score or loss function) or if the function needs probability estimates or rounded predictions (see the make_scorer function). The metric parameter accepts three ways of defining the scorer:

                                                                                                                                                                                  • Using the name of one of the predefined scorers.
                                                                                                                                                                                  • Using a function with signature function(y_true, y_pred) -> score. In this case, ATOM uses make_scorer with default parameters.
                                                                                                                                                                                  • Using a scorer object.

                                                                                                                                                                                  Note that all scorers follow the convention that higher return values are better than lower return values. Thus, metrics which measure the distance between the model and the data (i.e., loss functions), like max_error or mean_squared_error, will return the negated value of the metric.

                                                                                                                                                                                  "}, {"location": "user_guide/training/#predefined-scorers", "title": "Predefined scorers", "text": "

                                                                                                                                                                                  ATOM accepts all sklearn's scorers as well as some custom acronyms and custom scorers. Since some of sklearn's scorers have quite long names and ATOM is all about fast experimentation, the package provides acronyms for some of the most commonly used ones. These acronyms are case-insensitive and can be used in the metric parameter instead of the scorer's full name, e.g., atom.run(\"LR\", metric=\"BA\") uses balanced_accuracy. The available acronyms are:

                                                                                                                                                                                  • \"AP\" for \"average_precision\"
                                                                                                                                                                                  • \"BA\" for \"balanced_accuracy\"
                                                                                                                                                                                  • \"AUC\" for \"roc_auc\"
                                                                                                                                                                                  • \"LogLoss\" for \"neg_log_loss\"
                                                                                                                                                                                  • \"EV\" for \"explained_variance\"
                                                                                                                                                                                  • \"ME\" for \"max_error\"
                                                                                                                                                                                  • \"MAE\" for \"neg_mean_absolute_error\"
                                                                                                                                                                                  • \"MSE\" for \"neg_mean_squared_error\"
                                                                                                                                                                                  • \"RMSE\" for \"neg_root_mean_squared_error\"
                                                                                                                                                                                  • \"MSLE\" for \"neg_mean_squared_log_error\"
                                                                                                                                                                                  • \"MEDAE\" for \"neg_median_absolute_error\"
                                                                                                                                                                                  • \"MAPE\" for \"neg_mean_absolute_percentage_error\"
                                                                                                                                                                                  • \"POISSON\" for \"neg_mean_poisson_deviance\"
                                                                                                                                                                                  • \"GAMMA\" for \"neg_mean_gamma_deviance\"

                                                                                                                                                                                  ATOM also provides some extra common metrics for binary classification tasks.

                                                                                                                                                                                  • \"TN\" for True Negatives
                                                                                                                                                                                  • \"FP\" for False Positives
                                                                                                                                                                                  • \"FN\" for False Negatives
                                                                                                                                                                                  • \"TP\" for True Positives
                                                                                                                                                                                  • \"FPR\" for False Positive rate (fall-out)
                                                                                                                                                                                  • \"TPR\" for True Positive Rate (sensitivity, recall)
                                                                                                                                                                                  • \"TNR\" for True Negative Rate (specificity)
                                                                                                                                                                                  • \"FNR\" for False Negative Rate (miss rate)
                                                                                                                                                                                  • \"MCC\" for Matthews Correlation Coefficient (also for multiclass classification)

                                                                                                                                                                                  "}, {"location": "user_guide/training/#multi-metric-runs", "title": "Multi-metric runs", "text": "

                                                                                                                                                                                  Sometimes it is useful to measure the performance of the models in more than one way. ATOM lets you run the pipeline with multiple metrics at the same time. To do so, provide the metric parameter with a list of desired metrics, e.g., atom.run(\"LDA\", metric=[\"r2\", \"mse\"]).

                                                                                                                                                                                  When fitting multi-metric runs, the resulting scores are returned as a list with one value per metric. For example, if you provided three metrics to the pipeline, atom.knn.score_train could return [0.8734, 0.6672, 0.9001]. Only the first metric of a multi-metric run (this metric is called the main metric) is used to select the winning model.

                                                                                                                                                                                  Info

                                                                                                                                                                                  • The winning model is retrieved comparing only the main metric.
                                                                                                                                                                                  • Some plots let you choose which of the metrics in a multi-metric run to show using the metric parameter, e.g., plot_results.

                                                                                                                                                                                  "}, {"location": "user_guide/training/#automated-feature-scaling", "title": "Automated feature scaling", "text": "

                                                                                                                                                                                  Models that require feature scaling will automatically do so before training, unless the data is sparse or already scaled. The data is considered scaled if it has one of the following prerequisites:

                                                                                                                                                                                  • The mean value over the mean of all columns lies between -0.05 and 0.05 and the mean of the standard deviation over all columns lies between 0.85 and 1.15. Categorical and binary columns (only 0s and 1s) are excluded from the calculation.
                                                                                                                                                                                  • There is a transformer in the pipeline whose __name__ contains the word scaler.

                                                                                                                                                                                  The scaling is applied using a Scaler with default parameters. It can be accessed from the model through the scaler attribute. The scaled dataset can be examined through the model's data attributes. Use the available_models method to see which models require feature scaling. See here an example.

                                                                                                                                                                                  "}, {"location": "user_guide/training/#in-training-validation", "title": "In-training validation", "text": "

                                                                                                                                                                                  Some predefined models allow in-training validation. This means that the estimator is evaluated (using only the main metric) on the train and test set after every round of the training (a round can be an iteration for linear models or an added tree for boosted tree models). The validation scores are stored in the evals attribute, a dictionary of the train and test performances per round (also when pruning isn't applied). Click here for an example using in-training validation.

                                                                                                                                                                                  The predefined models that support in-training validation are:

                                                                                                                                                                                  • CatBoost
                                                                                                                                                                                  • LightGBM
                                                                                                                                                                                  • MultiLayerPerceptron
                                                                                                                                                                                  • PassiveAggressive
                                                                                                                                                                                  • Perceptron
                                                                                                                                                                                  • StochasticGradientDescent
                                                                                                                                                                                  • XGBoost

                                                                                                                                                                                  To apply in-training validation to a custom model, use the has_validation parameter when creating the custom model.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  • In-training validation is not calculated during hyperparameter tuning.
                                                                                                                                                                                  • CatBoost selects the weights achieved by the best evaluation on the test set after training. This means that, by default, there is some minor data leakage in the test set. Use the use_best_model=False parameter to avoid this behavior or use a holdout set to evaluate the final estimator.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the plot_evals method to visualize the in-training validation on the train and test sets.

                                                                                                                                                                                  "}, {"location": "user_guide/training/#parameter-customization", "title": "Parameter customization", "text": "

                                                                                                                                                                                  By default, every estimator uses the default parameters it gets from its respective package. To select different ones, use the est_params parameter of the run method. There are two ways to add custom parameters to the models: adding them directly to the dictionary as key-value pairs or through dictionaries.

                                                                                                                                                                                  Adding the parameters directly to est_params (or using a dict with the key 'all') shares them across all models in the trainer. In this example, both the XGBoost and the LightGBM model use 200 boosted trees. Make sure all the models do have the specified parameters or an exception will be raised!

                                                                                                                                                                                  atom.run(models=[\"XGB\", \"LGB\"], est_params={\"n_estimators\": 200})\n

                                                                                                                                                                                  To specify parameters per model, use the model name as key and a dict of the parameters as value. In this example, the XGBoost model uses n_estimators=200 and the MultiLayerPerceptron uses one hidden layer with 75 neurons.

                                                                                                                                                                                  atom.run(\n    models=[\"XGB\", \"MLP\"],\n    est_params={\n        \"XGB\": {\"n_estimators\": 200},\n        \"MLP\": {\"hidden_layer_sizes\": (75,)},\n    }\n)\n

                                                                                                                                                                                  Some estimators allow you to pass extra parameters to the fit method (besides X and y). This can be done adding _fit at the end of the parameter. For example, to change XGBoost's verbosity, we can run:

                                                                                                                                                                                  atom.run(models=\"XGB\", est_params={\"verbose_fit\": True})\n

                                                                                                                                                                                  Note

                                                                                                                                                                                  If a parameter is specified through est_params, it's ignored by the study, even if it's added manually to ht_params[\"distributions\"].

                                                                                                                                                                                  Info

                                                                                                                                                                                  The estimator's n_jobs and random_state parameters adopt atom's values (when available), unless specified through est_params.

                                                                                                                                                                                  "}, {"location": "user_guide/training/#hyperparameter-tuning", "title": "Hyperparameter tuning", "text": "

                                                                                                                                                                                  In order to achieve maximum performance, it's important to tune an estimator's hyperparameters before training it. ATOM provides hyperparameter tuning through the optuna package. Just like optuna, we use the terms study and trial as follows:

                                                                                                                                                                                  • Study: optimization based on an objective function.
                                                                                                                                                                                  • Trial: a single execution of the objective function.

                                                                                                                                                                                  Each trial is either computed by cross-validation on the complete training set or by randomly splitting the training set every iteration into a (sub)training and validation set. This process can create some minimal data leakage towards specific parameters (since the estimator is evaluated on data that is used to train the next estimator), but it ensures maximal use of the provided data. However, the leakage is not present in the independent test set, thus the final score of every model is unbiased. Note that, if the dataset is relatively small, the tuning's best score can consistently be lower than the final score on the test set due to the considerably lower fraction of instances on which it is trained. After finishing the study, the parameters that resulted in the best score are used to fit the final model on the complete training set.

                                                                                                                                                                                  Info

                                                                                                                                                                                  • Unless specified differently by the user, the used samplers are TPESampler for single-metric runs and NSGAIISampler for multi-metric runs.
                                                                                                                                                                                  • For multi-metric runs, the selected best trial is the trial that performed best on the main metric. Use the property's @setter to change it to any other trial. See the hyperparameter tuning example.

                                                                                                                                                                                  There are many possibilities to tune the study to your liking. The main parameter is n_trials, which determine the number of trials that are performed.

                                                                                                                                                                                  Extra things to take into account:

                                                                                                                                                                                  • The train/validation splits are different per trial but equal for all models.
                                                                                                                                                                                  • Re-evaluating the objective function at the same point (with the same hyperparameters) automatically skips the calculation and returns the same score as the equivalent trial.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  The hyperparameter tuning output can become quite wide for models with many hyperparameters. If you are working in a Jupyter Notebook, you can change the output's width running the following code in a cell:

                                                                                                                                                                                  from IPython.display import display, HTML\ndisplay(HTML(\"<style>.container { width:100% !important; }</style>\"))\n

                                                                                                                                                                                  Other settings can be changed through the ht_params parameter, a dictionary where every key-value combination can be used to further customize the optimization.

                                                                                                                                                                                  By default, which hyperparameters are tuned and their corresponding distributions are predefined by ATOM. Use the 'distributions' key to customize these. Just like with est_params, it's possible to share the same parameters across models or use a dictionary with the model name as key to specify the parameters for every individual model. Use the key 'all' to tune some hyperparameters for all models when you also want to tune other parameters only for specific ones. The following example tunes the n_estimators parameter for both models but the max_depth parameter only for the RandomForest.

                                                                                                                                                                                  atom.run(\n    models=[\"ET\", \"RF\"],\n    n_trials=30,\n    ht_params={\"distributions\": {\"all\": \"n_estimators\", \"RF\": \"max_depth\"}},\n)\n

                                                                                                                                                                                  Like the columns parameter in atom's methods, you can exclude parameters from the optimization by adding ! before their names. It's possible to exclude multiple parameters, but not to combine inclusion and exclusion for the same model. For example, to optimize an ExtraTrees model using all its predefined parameters except n_estimators, run:

                                                                                                                                                                                  atom.run(\n    models=\"ET\",\n    n_trials=15,\n    ht_params={\"distributions\": \"!n_estimators\"},\n)\n

                                                                                                                                                                                  If just the parameter name is provided, the predefined distribution is used. It's also possible to provide custom distribution spaces, but make sure they are compliant with optuna's API. See every model's individual documentation in ATOM's API section for an overview of their hyperparameters and distributions.

                                                                                                                                                                                  from optuna.distributions import (\n    IntDistribution, FloatDistribution, CategoricalDistribution\n)\n\natom.run(\n    models=[\"ET\", \"RF\"],\n    n_trials=30,\n    ht_params={\n        \"distributions\": {\n            \"all\": {\"n_estimators\": IntDistribution(10, 100, step=10)},\n            \"RF\": {\n                \"max_depth\": IntDistribution(1, 10),\n                \"max_features\": CategoricalDistribution([\"sqrt\", \"log2\"]),\n            },\n        },\n    }\n)\n

                                                                                                                                                                                  Parameters for optuna's study and the study's optimize method can be added as kwargs to ht_params. For example, to use a different sampler or add a custom callback.

                                                                                                                                                                                  from optuna.samplers import RandomSampler\n\natom.run(\n    models=\"LR\",\n    n_trials=30,\n    ht_params={\n        \"sampler\": RandomSampler(seed=atom.random_state),\n        \"callbacks\": custom_callback(),\n    },\n)\n

                                                                                                                                                                                  Note

                                                                                                                                                                                  • If you use the default sampler, it\u2019s recommended to consider setting larger n_trials to make full use of the characteristics of TPESampler because TPESampler uses some (by default, 10) trials for its startup.
                                                                                                                                                                                  • When specifying distributions manually, make sure to import the distribution types from optuna: from optuna.distributions import ....

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Keras' models can only use hyperparameter tuning when n_jobs=1 or ht_params={\"cv\": 1}. Using n_jobs > 1 and cv > 1 raises a PicklingError due to incompatibilities of the APIs. Read here more about deep learning models.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  ATOM has several plots that can help you examine a model's study and trials. Have a look at them here.

                                                                                                                                                                                  "}, {"location": "user_guide/training/#pruning", "title": "Pruning", "text": "

                                                                                                                                                                                  During hyperparameter tuning, pruning stops unpromising trials at the early stages of the training (a.k.a., automated early-stopping). This can save the pipeline much time that would otherwise be wasted on an estimator that is unlikely to yield the best results. A pruned trial can't be selected as best_trial. Click here to see an example that uses pruning.

                                                                                                                                                                                  The study uses MedianPruner as default pruner. You can use any other of optuna's pruners through the ht_params parameter.

                                                                                                                                                                                  from optuna.pruners import HyperbandPruner\n\natom.run(\"SGD\", n_trials=30, ht_params={\"pruner\": HyperbandPruner()})\n

                                                                                                                                                                                  Warning

                                                                                                                                                                                  • Pruning is disabled for multi-metric runs.
                                                                                                                                                                                  • Pruning is only available for models that support in-training validation.

                                                                                                                                                                                  "}, {"location": "user_guide/training/#bootstrapping", "title": "Bootstrapping", "text": "

                                                                                                                                                                                  After fitting the estimator, you can assess the robustness of the model using the bootstrap technique. This technique creates several new data sets selecting random samples from the training set (with replacement) and evaluates them on the test set. This way you can get a distribution of the performance of the model. The sets are the same for every model. The number of sets can be chosen through the n_bootstrap parameter.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the plot_results method to plot the bootstrap scores in a boxplot.

                                                                                                                                                                                  "}, {"location": "user_guide/training/#successive-halving", "title": "Successive halving", "text": "

                                                                                                                                                                                  Successive halving is a bandit-based algorithm that fits N models to 1/N of the data. The best half are selected to go to the next iteration where the process is repeated. This continues until only one model remains, which is fitted on the complete dataset. Beware that a model's performance can depend greatly on the amount of data on which it is trained. For this reason, we recommend using this technique only with similar models, e.g., only tree-based models.

                                                                                                                                                                                  Run successive halving from atom via the successive_halving method. Consecutive runs of the same model are saved with the model's acronym followed by the number of models in the run. For example, a RandomForest in a run with 4 models would become model RF4.

                                                                                                                                                                                  See here a successive halving example.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the plot_successive_halving method to see every model's performance per iteration of the successive halving.

                                                                                                                                                                                  "}, {"location": "user_guide/training/#train-sizing", "title": "Train sizing", "text": "

                                                                                                                                                                                  When training models, there is usually a trade-off between model performance and computation time, which is regulated by the number of samples in the training set. Train sizing can be used to gain insight into this trade-off and help determine the optimal size of the training set. The models are fitted multiple times, each time increasing the number of samples in the training set.

                                                                                                                                                                                  Run train sizing from atom via the train_sizing method. The number of iterations and the number of samples per training can be specified with the train_sizes parameter. Consecutive runs of the same model are saved with the model's acronym followed by the fraction of rows in the training set (the . is removed from the fraction!). For example, a RandomForest in a run with 80% of the training samples would become model RF08.

                                                                                                                                                                                  See here a train sizing example.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the plot_learning_curve method to see the model's performance per size of the training set.

                                                                                                                                                                                  "}]} +{"config": {"lang": ["en"], "separator": "[\\s\\-]+", "pipeline": ["stopWordFilter"]}, "docs": [{"location": "about/", "title": "About", "text": ""}, {"location": "about/#what-is-it", "title": "What is it?", "text": "

                                                                                                                                                                                  Automated Tool for Optimized Modeling (ATOM) is an open-source Python package designed to help data scientists speed up the exploration phase of their machine learning projects. ATOM is a low-code, easy-to-use library, capable of running experiments quickly and efficiently, enabling the user to go from raw data to generating insights in just a few lines of code. Click here to get started.

                                                                                                                                                                                  "}, {"location": "about/#what-can-i-do-with-it", "title": "What can I do with it?", "text": "

                                                                                                                                                                                  ATOM is an end-to-end solution for machine learning pipelines. It supports the user from raw data ingestion to the final results' analysis and model deployment. Click on the icons to read more about its main functionalities.

                                                                                                                                                                                  Data cleaning Feature engineering Model selection Hyperparametertuning Model training Model predictions Experiment logging Analysis &Interpretability"}, {"location": "about/#who-is-it-intended-for", "title": "Who is it intended for?", "text": "
                                                                                                                                                                                  • Data scientists that want to speed up the exploration phase of their machine learning projects.
                                                                                                                                                                                  • Data scientists that want to run a simple modeling experiment without having to spend too much time on coding.
                                                                                                                                                                                  • Data scientists that are new to Python and are not (yet) familiar with all the relevant machine learning packages.
                                                                                                                                                                                  • Data analysts without extensive knowledge of machine learning that want to try out model-based solutions.
                                                                                                                                                                                  • Anyone who wants to rapidly build a Proof of Concept, for example during a hackathon.
                                                                                                                                                                                  • Anyone who is new to the field of machine learning and wants a low-code, easy to learn package, to get started building predictive pipelines.
                                                                                                                                                                                  "}, {"location": "about/#citing-atom", "title": "Citing ATOM", "text": "

                                                                                                                                                                                  If you use ATOM in a scientific publication, please consider citing this documentation page as the resource. ATOM\u2019s first stable release v2.0.3 was made publicly available in November 2019. A formatted version of the citation would look like this:

                                                                                                                                                                                  ATOM v2.0.3, November 2019. URL https://tvdboom.github.io/ATOM/

                                                                                                                                                                                  BibTeX entry:

                                                                                                                                                                                  @Manual{ATOM,\n    title = {ATOM: A Python package for fast exploration of machine learning pipelines},\n    author = {Mavs},\n    year={2019},\n    mont={November},\n    note = {ATOM version 2.0.3},\n    url = {https://tvdboom.github.io/ATOM/},\n}\n

                                                                                                                                                                                  "}, {"location": "about/#support", "title": "Support", "text": "

                                                                                                                                                                                  ATOM recognizes the support from JetBrains by providing core project contributors with a set of developer tools free of charge.

                                                                                                                                                                                  "}, {"location": "about/#integrations", "title": "Integrations", "text": ""}, {"location": "contributing/", "title": "Contributing", "text": "

                                                                                                                                                                                  Are you interested in contributing to ATOM? Do you want to report a bug? Do you have a question? Before you do, please read the following guidelines.

                                                                                                                                                                                  "}, {"location": "contributing/#submission-context", "title": "Submission context", "text": ""}, {"location": "contributing/#question-or-problem", "title": "Question or problem?", "text": "

                                                                                                                                                                                  For quick questions, there's no need to open an issue. Check first if the question isn't already answered in the FAQ section. If not, reach us through the discussions page or on the slack channel.

                                                                                                                                                                                  "}, {"location": "contributing/#report-a-bug", "title": "Report a bug?", "text": "

                                                                                                                                                                                  If you found a bug in the source code, you can help by submitting an issue to the issue tracker in the GitHub repository. Even better, you can submit a Pull Request with a fix. However, before doing so, please read the submission guidelines.

                                                                                                                                                                                  "}, {"location": "contributing/#missing-a-feature", "title": "Missing a feature?", "text": "

                                                                                                                                                                                  You can request a new feature by submitting an issue to the GitHub Repository. If you would like to implement a new feature, please submit an issue with a proposal for your work first. Please consider what kind of change it is:

                                                                                                                                                                                  • For a major feature, first open an issue and outline your proposal so that it can be discussed. This will also allow us to better coordinate our efforts, prevent duplication of work, and help you to craft the change so that it is successfully accepted into the project.

                                                                                                                                                                                  • Small features and bugs can be crafted and directly submitted as a Pull Request. However, there is no guarantee that your feature will make it into master, as it's always a matter of opinion whether if benefits the overall functionality of the project.

                                                                                                                                                                                  "}, {"location": "contributing/#project-layout", "title": "Project layout", "text": "

                                                                                                                                                                                  The latest stable release of ATOM is on the master branch, whereas the latest version of ATOM in development is on the development branch. Make sure you are looking at and working on the correct branch if you're looking to contribute code.

                                                                                                                                                                                  In terms of directory structure:

                                                                                                                                                                                  • All of ATOM's code sources are in the atom directory.
                                                                                                                                                                                  • The documentation sources are in the docs_sources directory.
                                                                                                                                                                                  • Images in the documentation are in the docs_sources/img directory.
                                                                                                                                                                                  • Tutorial notebooks are in the examples directory. If you want to include the example to the documentation as well, add the .ipynb file to docs_sources/examples and update the mkdocs.yml file accordingly.
                                                                                                                                                                                  • Unit tests are in the tests directory. Make sure to add the tests to the file corresponding to the module in the atom directory with the code that is being tested.

                                                                                                                                                                                  Make sure to familiarize yourself with the project layout before making any major contributions, and especially make sure to send all code changes to the development branch.

                                                                                                                                                                                  "}, {"location": "contributing/#submission-guidelines", "title": "Submission guidelines", "text": ""}, {"location": "contributing/#submitting-an-issue", "title": "Submitting an issue", "text": "

                                                                                                                                                                                  Before you submit an issue, please search the issue tracker, maybe an issue for your problem already exists, and the discussion might inform you of workarounds readily available.

                                                                                                                                                                                  We want to fix all the issues as soon as possible, but before fixing a bug we need to reproduce and confirm it. In order to reproduce bugs we will systematically ask you to provide a minimal reproduction scenario using the custom issue template.

                                                                                                                                                                                  "}, {"location": "contributing/#submitting-a-pull-request", "title": "Submitting a pull request", "text": "

                                                                                                                                                                                  Before you submit a pull request, please work through this checklist to make sure that you have done the necessary so we can efficiently review and accept your changes.

                                                                                                                                                                                  • Update the documentation so all of your changes are reflected there.
                                                                                                                                                                                  • Adhere to PEP 8 standards.
                                                                                                                                                                                  • Use a maximum of 91 characters per line. Try to keep docstrings below 74 characters.
                                                                                                                                                                                  • Update the project unit tests to test your code changes as thoroughly as possible.
                                                                                                                                                                                  • Make sure that your code is properly commented with docstrings and comments explaining your rationale behind non-obvious coding practices.
                                                                                                                                                                                  • Run isort: isort atom tests.
                                                                                                                                                                                  • Run flake8: flake8 --show-source --statistics atom tests.
                                                                                                                                                                                  • Run pydocstyle: pydocstyle atom tests.
                                                                                                                                                                                  • Run mypy: mypy atom tests.

                                                                                                                                                                                  If your contribution requires a new library dependency:

                                                                                                                                                                                  • Double-check that the new dependency is easy to install via pip and Anaconda.
                                                                                                                                                                                  • The library should support Python 3.10 and 3.11.
                                                                                                                                                                                  • Make sure the code works with the latest version of the library.
                                                                                                                                                                                  • Update the dependencies in the documentation.
                                                                                                                                                                                  • Add the library with the minimum required version to pyproject.toml.

                                                                                                                                                                                  After submitting your pull request, GitHub will automatically run the tests on your changes and make sure that the updated code builds successfully. The checks run on Python 3.10 and 3.11, on Ubuntu and Windows. We also use services that automatically check code style and test coverage.

                                                                                                                                                                                  "}, {"location": "dependencies/", "title": "Dependencies", "text": ""}, {"location": "dependencies/#python-os", "title": "Python & OS", "text": "

                                                                                                                                                                                  At the moment, ATOM supports the following Python versions:

                                                                                                                                                                                  • Python 3.10
                                                                                                                                                                                  • Python 3.11

                                                                                                                                                                                  And operating systems:

                                                                                                                                                                                  • Linux (Ubuntu, Fedora, etc...)
                                                                                                                                                                                  • Windows 8.1+
                                                                                                                                                                                  • macOS (not tested)

                                                                                                                                                                                  "}, {"location": "dependencies/#packages", "title": "Packages", "text": ""}, {"location": "dependencies/#required", "title": "Required", "text": "

                                                                                                                                                                                  ATOM is built on top of several existing Python libraries. These packages are necessary for its correct functioning.

                                                                                                                                                                                  • beartype (>=0.16.4)
                                                                                                                                                                                  • category-encoders (>=2.6.3)
                                                                                                                                                                                  • dagshub (>=0.3.8)
                                                                                                                                                                                  • dill (>=0.3.6)
                                                                                                                                                                                  • gplearn (>=0.4.2)
                                                                                                                                                                                  • imbalanced-learn (>=0.11.0)
                                                                                                                                                                                  • ipython (>=8.11.0)
                                                                                                                                                                                  • ipywidgets (>=8.1.1)
                                                                                                                                                                                  • featuretools (>=1.28.0)
                                                                                                                                                                                  • joblib (>=1.3.1)
                                                                                                                                                                                  • matplotlib (>=3.7.2)
                                                                                                                                                                                  • mlflow (>=2.7.1)
                                                                                                                                                                                  • modin[ray] (>=0.25.0)
                                                                                                                                                                                  • nltk (>=3.8.1)
                                                                                                                                                                                  • numpy (>=1.23.0)
                                                                                                                                                                                  • optuna (>=3.4.0)
                                                                                                                                                                                  • pandas[parquet] (>=2.1.2)
                                                                                                                                                                                  • plotly (>=5.15.0)
                                                                                                                                                                                  • ray[serve] (>=2.7.1)
                                                                                                                                                                                  • scikit-learn (>=1.3.1)
                                                                                                                                                                                  • scikit-learn-intelex (>=2023.2.1)
                                                                                                                                                                                  • scipy (>=1.10.1)
                                                                                                                                                                                  • shap (>=0.43.0)
                                                                                                                                                                                  • sktime (>=0.24.0)
                                                                                                                                                                                  • zoofs (>=0.1.26)
                                                                                                                                                                                  "}, {"location": "dependencies/#optional", "title": "Optional", "text": "

                                                                                                                                                                                  Some specific models, utility methods or plots require the installation of additional libraries. You can install all the optional dependencies using pip install atom-ml[full]. Doing so also installs the following libraries:

                                                                                                                                                                                  • botorch (>=0.8.5)
                                                                                                                                                                                  • catboost (>=1.2)
                                                                                                                                                                                  • explainerdashboard (>=0.4.3)
                                                                                                                                                                                  • gradio (>=3.44.4)
                                                                                                                                                                                  • lightgbm (>=4.1.0)
                                                                                                                                                                                  • pmdarima (>=2.0.3)
                                                                                                                                                                                  • schemdraw (>=0.16)
                                                                                                                                                                                  • sweetviz (>=2.3.1)
                                                                                                                                                                                  • wordcloud (>=1.9.2)
                                                                                                                                                                                  • xgboost (>=2.0.0)
                                                                                                                                                                                  "}, {"location": "dependencies/#development", "title": "Development", "text": "

                                                                                                                                                                                  The development dependencies are not installed with the package, and are not required for any of its functionalities. These libraries are only necessary to contribute to the project. Install them running pdm install --dev (don't forget to install pdm with pip install -U pdm).

                                                                                                                                                                                  Linting

                                                                                                                                                                                  • isort (>=5.12.0)
                                                                                                                                                                                  • flake8 (>=6.0.0)
                                                                                                                                                                                  • flake8-pyproject (>=1.2.3)
                                                                                                                                                                                  • pydocstyle (>=6.3.0)
                                                                                                                                                                                  • mypy (>=1.6.1)
                                                                                                                                                                                  • pandas_stubs (>=2.1.1.230928)
                                                                                                                                                                                  • types-requests (>=2.31.0.10)

                                                                                                                                                                                  Testing

                                                                                                                                                                                  • nbmake (>=1.4.1)
                                                                                                                                                                                  • pytest (>=7.2.1)
                                                                                                                                                                                  • pytest-cov (>=4.0.0)
                                                                                                                                                                                  • pytest-xdist (>=3.2.0)
                                                                                                                                                                                  • scikeras (>=0.11.0)
                                                                                                                                                                                  • tensorflow (>=2.13.0)

                                                                                                                                                                                  Documentation

                                                                                                                                                                                  • jupyter-contrib-nbextensions (>=0.7.0)
                                                                                                                                                                                  • mike (>=1.1.2)
                                                                                                                                                                                  • mkdocs (>=1.5.3)
                                                                                                                                                                                  • mkdocs-autorefs (>=0.5.0)
                                                                                                                                                                                  • mkdocs-jupyter (>=0.24.6)
                                                                                                                                                                                  • mkdocs-material (>=9.4.7)
                                                                                                                                                                                  • mkdocs-simple-hooks (>=0.1.5)
                                                                                                                                                                                  • pymdown-extensions (>=10.3.1)
                                                                                                                                                                                  • pyyaml (>=6.0)
                                                                                                                                                                                  "}, {"location": "faq/", "title": "Frequently asked questions", "text": "

                                                                                                                                                                                  Here we try to give answers to some questions that have popped up regularly. If you have any other questions, don't hesitate to create a new discussion or post them on the Slack channel!

                                                                                                                                                                                  Is this package related to the Atom text editor?

                                                                                                                                                                                  There is, indeed, a text editor with the same name and a similar logo to this package. Is this a shameless copy? No. When I started the project, I didn't know about the text editor, and it doesn't require much thinking to come up with the idea of replacing the letter O of the word atom with the image of an atom.

                                                                                                                                                                                  How does ATOM relate to AutoML?

                                                                                                                                                                                  ATOM is not an AutoML tool since it does not automate the search for an optimal pipeline like well-known AutoML tools such as auto-sklearn or EvalML do. Instead, ATOM helps users find the optimal pipeline themselves. One of the goals of this package is to help data scientists produce explainable pipelines, and using an AutoML black box function would impede that.

                                                                                                                                                                                  Is it possible to run deep learning models?

                                                                                                                                                                                  Yes. Deep learning models can be added as custom models to the pipeline as long as they follow sklearn's API. For more information, see the deep learning section of the user guide.

                                                                                                                                                                                  Can I run atom's methods on just a subset of the columns?

                                                                                                                                                                                  Yes, all data cleaning and feature engineering methods accept a columns parameter to only transform the selected features. For example, to only impute the numerical columns in the dataset we could type atom.impute(strat_num=\"mean\", columns=atom.numerical). The parameter accepts column names, column indices, dtypes or a slice object.

                                                                                                                                                                                  How can I compare the same model on different datasets?

                                                                                                                                                                                  On many occasions, you might want to test how a model performs on datasets processed with different pipelines. For this, atom has the branch system. Create a new branch for every new pipeline you want to test and use the plot methods to compare all models, independent of the branch they were trained on.

                                                                                                                                                                                  Can I train models through atom using a GPU?

                                                                                                                                                                                  Yes. Refer to the user guide to see what algorithms and models have a GPU implementation. Be aware that it could require additional software and hardware dependencies.

                                                                                                                                                                                  How are numerical and categorical columns differentiated?

                                                                                                                                                                                  The columns are separated using a dataframe's select_dtypes method. Numerical columns are selected using include=\"number\" whereas categorical columns are selected using exclude=\"number\".

                                                                                                                                                                                  Can I run unsupervised learning pipelines?

                                                                                                                                                                                  No. As for now, ATOM only supports supervised machine learning pipelines. However, various unsupervised algorithms can be chosen as strategy in the Pruner class to detect and remove outliers from the dataset.

                                                                                                                                                                                  Is there a way to plot multiple models in the same shap plot?

                                                                                                                                                                                  No. Unfortunately, there is no way to plot multiple models in the same shap plot since the plots are made by the shap package and passed as matplotlib.axes objects to atom. This means that it's not within the reach of this package to implement such a utility.

                                                                                                                                                                                  Can I merge a sklearn pipeline with atom?

                                                                                                                                                                                  Yes. Like any other transformer, it is possible to add a sklearn pipeline to atom using the add method. Every transformer in the pipeline is merged independently. The pipeline is not allowed to end with a model since atom manages its own models. If that is the case, add the pipeline using atom.add(pipeline[:-1]).

                                                                                                                                                                                  Is it possible to initialize atom with an existing train and test set?

                                                                                                                                                                                  Yes. If you already have a separated train and test set you can initialize atom in two ways:

                                                                                                                                                                                  • atom = ATOMClassifier(train, test)
                                                                                                                                                                                  • atom = ATOMClassifier((X_train, y_train), (X_test, y_test))

                                                                                                                                                                                  Make sure the train and test size have the same number of columns! If atom is initialized in any of these two ways, the test_size parameter is ignored.

                                                                                                                                                                                  Can I train the models using cross-validation?

                                                                                                                                                                                  Applying cross-validation means transforming every step of the pipeline multiple times, each with different results. Doing this would prevent ATOM from being able to show the transformation results after every pre-processing step, which means losing the ability to inspect how a transformer changed the dataset. For this reason, it is not possible to apply cross-validation until after a model has been trained. After a model has been trained, the pipeline is defined, and cross-validation can be applied using the cross_validate method. See here an example using cross-validation.

                                                                                                                                                                                  Is there a way to process datetime features?

                                                                                                                                                                                  Yes, the FeatureExtractor class can automatically extract useful features (day, month, year, etc...) from datetime columns. The extracted features are always encoded to numerical values, so they can be fed directly to a model.

                                                                                                                                                                                  "}, {"location": "getting_started/", "title": "Getting started", "text": ""}, {"location": "getting_started/#installation", "title": "Installation", "text": "

                                                                                                                                                                                  Install ATOM's newest release easily via pip:

                                                                                                                                                                                  pip install -U atom-ml\n

                                                                                                                                                                                  or via conda:

                                                                                                                                                                                  conda install -c conda-forge atom-ml\n

                                                                                                                                                                                  Note

                                                                                                                                                                                  Since atom was already taken, download the package under the name atom-ml!

                                                                                                                                                                                  Warning

                                                                                                                                                                                  ATOM makes use of many other ML libraries, making its dependency list quite long. Because of that, the installation may take longer than you are accustomed to. Be patient!

                                                                                                                                                                                  Optional dependencies

                                                                                                                                                                                  Some specific models, utility methods or plots require the installation of additional libraries. To install the optional dependencies, add [full] after the package's name.

                                                                                                                                                                                  pip install -U atom-ml[full]\n

                                                                                                                                                                                  Latest source

                                                                                                                                                                                  Sometimes, new features and bug fixes are already implemented in the development branch, but waiting for the next release to be made available. If you can't wait for that, it's possible to install the package directly from git.

                                                                                                                                                                                  pip install git+https://github.com/tvdboom/ATOM.git@development#egg=atom-ml\n

                                                                                                                                                                                  Don't forget to include #egg=atom-ml to explicitly name the project, this way pip can track metadata for it without having to have run the setup.py script.

                                                                                                                                                                                  Contributing

                                                                                                                                                                                  If you are planning to contribute to the project, you'll need the development dependencies. Install them adding [dev] after the package's name.

                                                                                                                                                                                  pip install -U atom-ml[dev]\n

                                                                                                                                                                                  Click here for a complete list of package files for all versions published on PyPI.

                                                                                                                                                                                  "}, {"location": "getting_started/#usage", "title": "Usage", "text": "

                                                                                                                                                                                  ATOM contains a variety of classes and functions to perform data cleaning, feature engineering, model training, plotting and much more. The easiest way to use everything ATOM has to offer is through one of the main classes:

                                                                                                                                                                                  • ATOMClassifier for classification tasks.
                                                                                                                                                                                  • ATOMForecaster for forecasting tasks.
                                                                                                                                                                                  • ATOMRegressor for regression tasks.

                                                                                                                                                                                  Let's walk you through an example. Click on the SageMaker Studio Lab badge on top of this section to run this example yourself.

                                                                                                                                                                                  Make the necessary imports and load the data.

                                                                                                                                                                                  >>> import pandas as pd\n>>> from atom import ATOMClassifier\n\n>>> # Load the Australian Weather dataset\n>>> X = pd.read_csv(\"./examples/datasets/weatherAUS.csv\", nrows=100)\n>>> print(X.head())\n\n           Location  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine WindGustDir  WindGustSpeed WindDir9am WindDir3pm  WindSpeed9am  WindSpeed3pm  Humidity9am  Humidity3pm  Pressure9am  Pressure3pm  Cloud9am  Cloud3pm  Temp9am  Temp3pm RainToday  RainTomorrow\n0  MelbourneAirport     18.0     26.9      21.4          7.0       8.9         SSE           41.0          W        SSE           9.0          20.0         95.0         54.0       1019.5       1017.0       8.0       5.0     18.5     26.0       Yes             0\n1          Adelaide     17.2     23.4       0.0          NaN       NaN           S           41.0          S        WSW          13.0          19.0         59.0         36.0       1015.7       1015.7       NaN       NaN     17.7     21.9        No             0\n2            Cairns     18.6     24.6       7.4          3.0       6.1         SSE           54.0        SSE         SE          26.0          35.0         78.0         57.0       1018.7       1016.6       3.0       3.0     20.8     24.1       Yes             0\n3          Portland     13.6     16.8       4.2          1.2       0.0         ESE           39.0        ESE        ESE          17.0          15.0         76.0         74.0       1021.4       1020.5       7.0       8.0     15.6     16.0       Yes             1\n4           Walpole     16.4     19.9       0.0          NaN       NaN          SE           44.0         SE         SE          19.0          30.0         78.0         70.0       1019.4       1018.9       NaN       NaN     17.4     18.1        No             0\n

                                                                                                                                                                                  Initialize the ATOMClassifier or ATOMRegressor class. These two classes are convenient wrappers for the whole machine learning pipeline. Contrary to sklearn's API, they are initialized providing the data you want to manipulate.

                                                                                                                                                                                  >>> atom = ATOMClassifier(X, y=\"RainTomorrow\", verbose=2)\n\n<< ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (100, 22)\nTrain set size: 80\nTest set size: 20\n-------------------------------------\nMemory: 17.73 kB\nScaled: False\nMissing values: 193 (8.8%)\nCategorical features: 5 (23.8%)\n

                                                                                                                                                                                  Data transformations are applied through atom's methods. For example, calling the impute method will initialize an Imputer instance, fit it on the training set and transform the whole dataset. The transformations are applied immediately after calling the method (no fit and transform commands necessary).

                                                                                                                                                                                  >>> atom.impute(strat_num=\"median\", strat_cat=\"most_frequent\")  \n\nFitting Imputer...\nImputing missing values...\n --> Imputing 1 missing values with median (0.0) in feature Rainfall.\n --> Imputing 36 missing values with median (4.8) in feature Evaporation.\n --> Imputing 38 missing values with median (8.45) in feature Sunshine.\n --> Imputing 8 missing values with most_frequent (SSE) in feature WindGustDir.\n --> Imputing 8 missing values with median (41.0) in feature WindGustSpeed.\n --> Imputing 7 missing values with most_frequent (ESE) in feature WindDir9am.\n --> Imputing 2 missing values with median (13.0) in feature WindSpeed9am.\n --> Imputing 1 missing values with median (74.0) in feature Humidity9am.\n --> Imputing 6 missing values with median (1017.55) in feature Pressure9am.\n --> Imputing 6 missing values with median (1015.4) in feature Pressure3pm.\n --> Imputing 38 missing values with median (5.5) in feature Cloud9am.\n --> Imputing 40 missing values with median (5.0) in feature Cloud3pm.\n --> Imputing 1 missing values with median (17.2) in feature Temp9am.\n --> Imputing 1 missing values with most_frequent (No) in feature RainToday.\n\n>>> atom.encode(strategy=\"Target\", max_onehot=8)\n\nFitting Encoder...\nEncoding categorical columns...\n --> Target-encoding feature Location. Contains 42 classes.\n   --> Handling 2 unknown classes.\n --> Target-encoding feature WindGustDir. Contains 16 classes.\n --> Target-encoding feature WindDir9am. Contains 16 classes.\n   --> Handling 1 unknown classes.\n --> Target-encoding feature WindDir3pm. Contains 16 classes.\n --> Ordinal-encoding feature RainToday. Contains 2 classes.\n

                                                                                                                                                                                  Similarly, models are trained and evaluated using the run method. Here, we fit both a LogisticRegression and LinearDiscriminantAnalysis model, and apply hyperparameter tuning.

                                                                                                                                                                                  >>> atom.run(models=[\"LR\", \"LDA\"], metric=\"auc\", n_trials=6)\n\n\nTraining ========================= >>\nModels: LR, LDA\nMetric: auc\n\n\nRunning hyperparameter tuning for LogisticRegression...\n| trial | penalty |       C |  solver | max_iter | l1_ratio |     auc | best_auc | time_trial | time_ht |    state |\n| ----- | ------- | ------- | ------- | -------- | -------- | ------- | -------- | ---------- | ------- | -------- |\n| 0     |      l2 |  1.1302 |     sag |      730 |      0.3 |  0.5417 |   0.5417 |     0.093s |  0.093s | COMPLETE |\n| 1     |    None |  0.1544 |   lbfgs |      120 |      0.5 |  0.8542 |   0.8542 |     0.092s |  0.185s | COMPLETE |\n| 2     |      l2 |  0.0027 |     sag |      460 |      0.4 |  0.5625 |   0.8542 |     0.090s |  0.275s | COMPLETE |\n| 3     |      l2 |  0.0062 |   lbfgs |      800 |      0.8 |  0.6042 |   0.8542 |     0.090s |  0.365s | COMPLETE |\n| 4     | elast.. |  4.2724 |    saga |      530 |      0.1 |  0.6042 |   0.8542 |     0.096s |  0.461s | COMPLETE |\n| 5     |      l2 |  1.3274 | newto.. 
|      680 |      0.3 |  0.5625 |   0.8542 |     0.093s |  0.555s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 1\nBest parameters:\n --> penalty: None\n --> C: 0.1544\n --> solver: lbfgs\n --> max_iter: 120\n --> l1_ratio: 0.5\nBest evaluation --> auc: 0.8542\nTime elapsed: 0.555s\nFit ---------------------------------------------\nTrain evaluation --> auc: 1.0\nTest evaluation --> auc: 0.4133\nTime elapsed: 0.074s\n-------------------------------------------------\nTime: 0.629s\n\n\nRunning hyperparameter tuning for LinearDiscriminantAnalysis...\n| trial |  solver | shrinkage |     auc | best_auc | time_trial | time_ht |    state |\n| ----- | ------- | --------- | ------- | -------- | ---------- | ------- | -------- |\n| 0     |     svd |      None |  0.6458 |   0.6458 |     0.086s |  0.086s | COMPLETE |\n| 1     |    lsqr |       0.7 |  0.9375 |   0.9375 |     0.081s |  0.167s | COMPLETE |\n| 2     |     svd |       nan |  0.6458 |   0.9375 |     0.001s |  0.168s | COMPLETE |\n| 3     |    lsqr |       0.8 |   0.625 |   0.9375 |     0.079s |  0.247s | COMPLETE |\n| 4     |     svd |       nan |  0.6458 |   0.9375 |     0.000s |  0.247s | COMPLETE |\n| 5     |   eigen |       0.8 |    0.75 |   0.9375 |     0.078s |  0.326s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 1\nBest parameters:\n --> solver: lsqr\n --> shrinkage: 0.7\nBest evaluation --> auc: 0.9375\nTime elapsed: 0.326s\nFit ---------------------------------------------\nTrain evaluation --> auc: 0.8576\nTest evaluation --> auc: 0.8933\nTime elapsed: 0.016s\n-------------------------------------------------\nTime: 0.342s\n\n\nFinal results ==================== >>\nTotal time: 1.005s\n-------------------------------------\nLogisticRegression         --> auc: 0.4133 ~\nLinearDiscriminantAnalysis --> auc: 0.8933 !\n

                                                                                                                                                                                  And lastly, analyze the results.

                                                                                                                                                                                  >>> print(atom.evaluate())\n\n     accuracy      ap      ba      f1  jaccard     mcc  precision  recall     auc\nLR       0.60  0.2793  0.4000  0.0000      0.0 -0.2425       0.00     0.0  0.4667\nLDA      0.85  0.7944  0.7667  0.6667      0.5  0.5774       0.75     0.6  0.9067\n\n\n>>> atom.plot_lift()\n
                                                                                                                                                                                  "}, {"location": "license/", "title": "MIT License", "text": "

                                                                                                                                                                                  Copyright \u00a9 2023 Mavs

                                                                                                                                                                                  Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \"Software\"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

                                                                                                                                                                                  The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

                                                                                                                                                                                  THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomclassifier/", "title": "ATOMClassifier", "text": "

                                                                                                                                                                                  class atom.api.ATOMClassifier(*arrays, y=-1, index=False, shuffle=True, stratify=True, n_rows=1, test_size=0.2, holdout_size=None, n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Main class for classification tasks.

                                                                                                                                                                                  Apply all data transformations and model management provided by the package on a given dataset. Note that, contrary to sklearn's API, the instance contains the dataset on which to perform the analysis. Calling a method will automatically apply it on the dataset it contains.

                                                                                                                                                                                  All data cleaning, feature engineering, model training and plotting functionality can be accessed from an instance of this class.

                                                                                                                                                                                  Parameters *arrays: sequence of indexables Dataset containing features and target. Allowed formats are:

                                                                                                                                                                                  • X
                                                                                                                                                                                  • X, y
                                                                                                                                                                                  • train, test
                                                                                                                                                                                  • train, test, holdout
                                                                                                                                                                                  • X_train, X_test, y_train, y_test
                                                                                                                                                                                  • X_train, X_test, X_holdout, y_train, y_test, y_holdout
                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test)
                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)

                                                                                                                                                                                  X, train, test: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str, sequence or dataframe Target column corresponding to `X`.

                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                  y: int, str, dict, sequence or dataframe, default=-1 Target column corresponding to `X`.

                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                  This parameter is ignored if the target column is provided through arrays.

                                                                                                                                                                                  index: bool, int, str or sequence, default=False Handle the index in the resulting dataframe.

                                                                                                                                                                                  • If False: Reset to RangeIndex.
                                                                                                                                                                                  • If True: Use the provided index.
                                                                                                                                                                                  • If int: Position of the column to use as index.
                                                                                                                                                                                  • If str: Name of the column to use as index.
                                                                                                                                                                                  • If sequence: Array with shape=(n_samples,) to use as index.

                                                                                                                                                                                  test_size: int or float, default=0.2

                                                                                                                                                                                  • If <=1: Fraction of the dataset to include in the test set.
                                                                                                                                                                                  • If >1: Number of rows to include in the test set.

                                                                                                                                                                                  This parameter is ignored if the test set is provided through arrays.

                                                                                                                                                                                  holdout_size: int, float or None, default=None

                                                                                                                                                                                  • If None: No holdout data set is kept apart.
                                                                                                                                                                                  • If <=1: Fraction of the dataset to include in the holdout set.
                                                                                                                                                                                  • If >1: Number of rows to include in the holdout set.

                                                                                                                                                                                  This parameter is ignored if the holdout set is provided through arrays.

                                                                                                                                                                                  shuffle: bool, default=True Whether to shuffle the dataset before splitting the train and test set. Be aware that not shuffling the dataset can cause an unequal distribution of target classes over the sets.

                                                                                                                                                                                  stratify: bool, int, str or sequence, default=True Handle stratification of the target classes over the data sets.

                                                                                                                                                                                  • If False: The data is split randomly.
                                                                                                                                                                                  • If True: The data is stratified over the target column.
                                                                                                                                                                                  • Else: Name or position of the columns to stratify by. The columns can't contain NaN values.

                                                                                                                                                                                  This parameter is ignored if shuffle=False or if the test set is provided through arrays.

                                                                                                                                                                                  For multioutput tasks, stratification is applied to the joint target columns.

                                                                                                                                                                                  n_rows: int or float, default=1 Random subsample of the dataset to use. The default value selects all rows.

                                                                                                                                                                                  • If <=1: Fraction of the dataset to select.
                                                                                                                                                                                  • If >1: Exact number of rows to select. Only if arrays is X or X, y.

                                                                                                                                                                                  n_jobs: int, default=1 Number of cores to use for parallel processing.

                                                                                                                                                                                  • If >0: Number of cores to use.
                                                                                                                                                                                  • If -1: Use all available cores.
                                                                                                                                                                                  • If <-1: Use number of cores + 1 + n_jobs (e.g., -2 uses all cores but one).

                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                    • \"sklearnex\"
                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                  backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

                                                                                                                                                                                  • \"loky\": Single-node, process-based parallelism.
                                                                                                                                                                                  • \"multiprocessing\": Legacy single-node, process-based parallelism. Less robust than loky.
                                                                                                                                                                                  • \"threading\": Single-node, thread-based parallelism.
                                                                                                                                                                                  • \"ray\": Multi-node, process-based parallelism.

                                                                                                                                                                                  memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide.

                                                                                                                                                                                  • If False: No caching is performed.
                                                                                                                                                                                  • If True: A default temp directory is used.
                                                                                                                                                                                  • If str: Path to the caching directory.
                                                                                                                                                                                  • If Path: A pathlib.Path to the caching directory.
                                                                                                                                                                                  • If Memory: Object with the joblib.Memory interface.

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                  warnings: bool or str, default=False

                                                                                                                                                                                  • If True: Default warning action (equal to \"once\").
                                                                                                                                                                                  • If False: Suppress all warnings (equal to \"ignore\").
                                                                                                                                                                                  • If str: One of python's warnings filters.

                                                                                                                                                                                  Changing this parameter affects the PYTHONWarnings environment. ATOM can't manage warnings that go from C/C++ code to stdout.

                                                                                                                                                                                  logger: str, Path, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic name.
                                                                                                                                                                                  • If Path: A pathlib.Path to the log file.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed.

                                                                                                                                                                                  random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  ATOMForecaster Main class for forecasting tasks.

                                                                                                                                                                                  ATOMRegressor Main class for regression tasks.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomclassifier/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> # Initialize atom\n>>> atom = ATOMClassifier(X, y, verbose=2)\n\n<< ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 138.97 kB\nScaled: False\nOutlier values: 180 (1.3%)\n\n\n\n>>> # Apply data cleaning and feature engineering methods\n>>> atom.balance(strategy=\"smote\")\n\nOversampling with SMOTE...\n --> Adding 116 samples to class 0.\n\n>>> atom.feature_selection(strategy=\"rfe\", solver=\"lr\", n_features=22)\n\nFitting FeatureSelector...\nPerforming feature selection ...\n --> rfe selected 22 features from the dataset.\n   --> Dropping feature mean area (rank 7).\n   --> Dropping feature mean compactness (rank 2).\n   --> Dropping feature mean fractal dimension (rank 6).\n   --> Dropping feature smoothness error (rank 9).\n   --> Dropping feature concave points error (rank 4).\n   --> Dropping feature fractal dimension error (rank 8).\n   --> Dropping feature worst radius (rank 3).\n   --> Dropping feature worst area (rank 5).\n\n\n>>> # Train models\n>>> atom.run(models=[\"LR\", \"RF\", \"XGB\"])\n\n\nTraining ========================= >>\nModels: LR, RF, XGB\nMetric: f1\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9878\nTest evaluation --> f1: 0.9859\nTime elapsed: 0.086s\n-------------------------------------------------\nTime: 0.086s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation 
--> f1: 1.0\nTest evaluation --> f1: 0.9714\nTime elapsed: 0.251s\n-------------------------------------------------\nTime: 0.251s\n\n\nResults for XGBoost:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9718\nTime elapsed: 0.412s\n-------------------------------------------------\nTime: 0.412s\n\n\nFinal results ==================== >>\nTotal time: 0.759s\n-------------------------------------\nLogisticRegression --> f1: 0.9859 !\nRandomForest       --> f1: 0.9714\nXGBoost            --> f1: 0.9718\n\n\n>>> # Analyze the results\n>>> print(atom.results)\n\n     f1_train  f1_test  time_fit      time\nLR     0.9878   0.9859  0.086078  0.086078\nRF     1.0000   0.9714  0.251238  0.251238\nXGB    1.0000   0.9718  0.412373  0.412373\n\n\n>>> print(atom.evaluate())\n\n     accuracy      ap      ba      f1  jaccard     mcc  precision  recall     auc\nLR     0.9823  0.9975  0.9811  0.9859   0.9722  0.9621     0.9859  0.9859  0.9960\nRF     0.9646  0.9704  0.9670  0.9714   0.9444  0.9256     0.9855  0.9577  0.9670\nXGB    0.9646  0.9622  0.9621  0.9718   0.9452  0.9242     0.9718  0.9718  0.9621\n
                                                                                                                                                                                  "}, {"location": "API/ATOM/atomclassifier/#magic-methods", "title": "Magic methods", "text": "

                                                                                                                                                                                  The class contains some magic methods to help you access some of its elements faster. Note that methods that apply on the pipeline can return different results per branch.

                                                                                                                                                                                  • __repr__: Prints an overview of atom's branches, models and metric.
                                                                                                                                                                                  • __len__: Returns the length of the dataset.
                                                                                                                                                                                  • __iter__: Iterate over the pipeline's transformers.
                                                                                                                                                                                  • __contains__: Checks if the provided item is a column in the dataset.
                                                                                                                                                                                  • __getitem__: Access a branch, model, column or subset of the dataset.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomclassifier/#attributes", "title": "Attributes", "text": ""}, {"location": "API/ATOM/atomclassifier/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                  The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.

                                                                                                                                                                                  Attributes pipeline: Pipeline Pipeline of transforms.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                  mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                  The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set.

                                                                                                                                                                                  This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). scaled: boolWhether the feature set is scaled.

                                                                                                                                                                                  A data set is considered scaled when it has mean=0 and std=1, or when there is a scaler in the pipeline. Binary columns (only zeros and ones) are excluded from the calculation. duplicates: int | numpy.integerNumber of duplicate rows in the dataset. missing: list[Any]Values that are considered \"missing\".

                                                                                                                                                                                  These values are used by the clean and impute methods. Default values are: None, NaN, NA, NaT, +inf, -inf, \"\", \"?\", \"NA\", \"nan\", \"NaN\", \"NaT\", \"none\", \"None\", \"inf\", \"-inf\". Note that None, NaN, NA, +inf and -inf are always considered missing since they are incompatible with sklearn estimators. nans: Series | modin.pandas.series.SeriesColumns with the number of missing values in them.

                                                                                                                                                                                  This property is unavailable for sparse datasets. n_nans: intNumber of rows containing missing values.

                                                                                                                                                                                  This property is unavailable for sparse datasets. numerical: IndexNames of the numerical features in the dataset. n_numerical: intNumber of numerical features in the dataset. categorical: IndexNames of the categorical features in the dataset. n_categorical: intNumber of categorical features in the dataset. outliers: SeriesColumns in training set with number of outlier values.

                                                                                                                                                                                  This property is unavailable for sparse datasets. n_outliers: int | numpy.integerNumber of samples in the training set containing outliers.

                                                                                                                                                                                  This property is unavailable for sparse datasets. classes: DataFrameDistribution of target classes per data set.

                                                                                                                                                                                  This property is only available for classification tasks. n_classes: int | numpy.integer | Series | modin.pandas.series.SeriesNumber of classes in the target column(s).

                                                                                                                                                                                  This property is only available for classification tasks.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomclassifier/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                  The utility attributes are used to access information about the models in the instance after training.

                                                                                                                                                                                  Attributesbranch: BranchCurrent active branch.

                                                                                                                                                                                  Use the property's @setter to change the branch or to create a new one. If the value is the name of an existing branch, switch to that one. Else, create a new branch using that name. The new branch is split from the current branch. Use _from_ to split the new branch from any other existing branch. Read more in the user guide. models: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance.

                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. winner: model | NoneBest performing model.

                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. results: pd.DataFrameOverview of the training results.

                                                                                                                                                                                  All durations are in seconds. Possible values include:

                                                                                                                                                                                  • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                  • [metric]_train: Metric score on the train set.
                                                                                                                                                                                  • [metric]_test: Metric score on the test set.
                                                                                                                                                                                  • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                  • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                  • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                  • time: Total duration of the run.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomclassifier/#tracking-attributes", "title": "Tracking attributes", "text": "

                                                                                                                                                                                  The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.

                                                                                                                                                                                  Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomclassifier/#plot-attributes", "title": "Plot attributes", "text": "

                                                                                                                                                                                  The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

                                                                                                                                                                                  Attributespalette: str | Sequence[str]Color palette.

                                                                                                                                                                                  Specify one of plotly's built-in palettes or create a custom one, e.g., atom.palette = [\"red\", \"green\", \"blue\"]. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomclassifier/#utility-methods", "title": "Utility methods", "text": "

                                                                                                                                                                                  Next to the plotting methods, the class contains a variety of utility methods to handle the data and manage the pipeline.

                                                                                                                                                                                  addAdd a transformer to the pipeline.applyApply a function to the dataset.available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.distributionGet statistics on column distributions.edaCreate an Exploratory Data Analysis report.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_sample_weightReturn sample weights for a balanced data set.inverse_transformInversely transform new data through the pipeline.loadLoad an atom instance from a pickle file.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.resetReset the instance to its initial state.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_dataSave the data in the current branch to a .csv file.shrinkConvert the columns to the smallest possible matching dtype.stackingAdd a Stacking model to the pipeline.statsDisplay basic information about the dataset.statusGet an overview of the branches and models.transformTransform new data through the pipeline.votingAdd a Voting model to the pipeline.

                                                                                                                                                                                  method add(transformer, columns=None, train_only=False, **fit_params)[source]Add a transformer to the pipeline.

                                                                                                                                                                                  If the transformer is not fitted, it is fitted on the complete training set. Afterwards, the data set is transformed and the estimator is added to atom's pipeline. If the estimator is a sklearn Pipeline, every estimator is merged independently with atom.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  • The transformer should have fit and/or transform methods with arguments X (accepting a dataframe-like object of shape=(n_samples, n_features)) and/or y (accepting a sequence of shape=(n_samples,)).
                                                                                                                                                                                  • The transform method should return a feature set as a dataframe-like object of shape=(n_samples, n_features) and/or a target column as a sequence of shape=(n_samples,).

                                                                                                                                                                                  Note

                                                                                                                                                                                  If the transform method doesn't return a dataframe:

                                                                                                                                                                                  • The column naming happens as follows. If the transformer has a get_feature_names_out method, it is used. If not, and it returns the same number of columns, the names are kept equal. If the number of columns changes, old columns will keep their name (as long as the column is unchanged) and new columns will receive the name x[N-1], where N stands for the n-th feature. This means that a transformer should only transform, add or drop columns, not combinations of these.
                                                                                                                                                                                  • The index remains the same as before the transformation. This means that the transformer should not add, remove or shuffle rows unless it returns a dataframe.

                                                                                                                                                                                  Note

                                                                                                                                                                                  If the transformer has an n_jobs and/or random_state parameter that is left to its default value, it adopts atom's value.

                                                                                                                                                                                  Parameterstransformer: Transformer Estimator to add to the pipeline. Should implement a transform method.

                                                                                                                                                                                  columns: int, str, segment, sequence, dataframe or None, default=None Selection of columns to transform. Only select features or the target column, not both at the same time (if that happens, the target column is ignored). If None, transform all columns.

                                                                                                                                                                                  train_only: bool, default=False Whether to apply the estimator only on the training set or on the complete dataset. Note that if True, the transformation is skipped when making predictions on new data.

                                                                                                                                                                                  **fit_params Additional keyword arguments for the transformer's fit method.

                                                                                                                                                                                  method apply(func, inverse_func=None, kw_args=None, inv_kw_args=None, **kwargs)[source]Apply a function to the dataset.

                                                                                                                                                                                  This method is useful for stateless transformations such as taking the log, doing custom scaling, etc...

                                                                                                                                                                                  Note

                                                                                                                                                                                  This approach is preferred over changing the dataset directly through the property's @setter since the transformation is stored in the pipeline.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use atom.apply(lambda df: df.drop(\"column_name\",axis=1)) to store the removal of columns in the pipeline.

                                                                                                                                                                                  Parametersfunc: callable Function to apply with signature func(dataset, **kw_args) -> dataset.

                                                                                                                                                                                  inverse_func: callable or None, default=None Inverse function of func. If None, the inverse_transform method returns the input unchanged.

                                                                                                                                                                                  kw_args: dict or None, default=None Additional keyword arguments for the function.

                                                                                                                                                                                  inv_kw_args: dict or None, default=None Additional keyword arguments for the inverse function.

                                                                                                                                                                                  method available_models()[source]Give an overview of the available predefined models.

                                                                                                                                                                                  Returnspd.DataFrame Information about the available predefined models. Columns include:

                                                                                                                                                                                  • acronym: Model's acronym (used to call the model).
                                                                                                                                                                                  • model: Name of the model's class.
                                                                                                                                                                                  • estimator: The model's underlying estimator.
                                                                                                                                                                                  • module: The estimator's module.
                                                                                                                                                                                  • needs_scaling: Whether the model requires feature scaling.
                                                                                                                                                                                  • accepts_sparse: Whether the model accepts sparse matrices.
                                                                                                                                                                                  • native_multilabel: Whether the model has native support for multilabel tasks.
                                                                                                                                                                                  • native_multioutput: Whether the model has native support for multioutput tasks.
                                                                                                                                                                                  • has_validation: Whether the model has in-training validation.
                                                                                                                                                                                  • supports_engines: Engines supported by the model.

                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                  Parametersrows: int, default=1 Number of rows of plots in the canvas.

                                                                                                                                                                                  cols: int, default=2 Number of columns of plots in the canvas.

                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from all models.

                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                  • Shap values
                                                                                                                                                                                  • App instance
                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                  method delete(models=None)[source]Delete models.

                                                                                                                                                                                  If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted.

                                                                                                                                                                                  method distribution(distributions=None, columns=None)[source]Get statistics on column distributions.

                                                                                                                                                                                  Compute the Kolmogorov-Smirnov test for various distributions against columns in the dataset. Only for numerical columns. Missing values are ignored.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the plot_distribution method to plot a column's distribution.

                                                                                                                                                                                  Parametersdistributions: str, sequence or None, default=None Names of the distributions in scipy.stats to get the statistics on. If None, a selection of the most common ones is used.

                                                                                                                                                                                  columns: int, str, segment, sequence, dataframe or None, default=None Selection of columns to perform the test on. If None, select all numerical columns.

                                                                                                                                                                                  Returnspd.DataFrame Statistic results with multiindex levels:

                                                                                                                                                                                  • dist: Name of the distribution.
                                                                                                                                                                                  • stat: Statistic results:
                                                                                                                                                                                    • score: KS-test score.
                                                                                                                                                                                    • p_value: Corresponding p-value.

                                                                                                                                                                                  method eda(rows=\"dataset\", target=0, filename=None)[source]Create an Exploratory Data Analysis report.

                                                                                                                                                                                  ATOM uses the sweetviz package for EDA. The report is rendered directly in the notebook. It can also be accessed through the report attribute. It can either report one dataset or compare two datasets against each other.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  This method can be slow for large datasets.

                                                                                                                                                                                  Parametersrows: str, sequence or dict, default=\"dataset\" Selection of rows to include in the report.

                                                                                                                                                                                  • If str: Name of the data set to report.
                                                                                                                                                                                  • If sequence: Names of two data sets to compare.
                                                                                                                                                                                  • If dict: Names of up to two data sets with corresponding selection of rows to report.

                                                                                                                                                                                  target: int or str, default=0 Target column to look at. Only for multilabel tasks. Only bool and numerical features can be used as target.

                                                                                                                                                                                  filename: str, Path or None, default=None Filename or pathlib.Path of the (html) file to save. If None, don't save anything.

                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.

                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task.

                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.

                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                  Returnspd.DataFrame Scores of the models.

                                                                                                                                                                                  method export_pipeline(model=None)[source]Export the internal pipeline.

                                                                                                                                                                                  This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

                                                                                                                                                                                  Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported.

                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                  method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.

                                                                                                                                                                                  Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.

                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

                                                                                                                                                                                  Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks.

                                                                                                                                                                                  method get_sample_weight(rows=\"train\")[source]Return sample weights for a balanced data set.

                                                                                                                                                                                  The returned weights are inversely proportional to the class frequencies in the selected data set. For multioutput tasks, the weights of each column of y will be multiplied.

                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

                                                                                                                                                                                  Returnsseries Sequence of weights with shape=(n_samples,).

                                                                                                                                                                                  method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be used to transform only the target column.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                  Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                  function atom.atom.load(filename, data=None)[source]Load an atom instance from a pickle file.

                                                                                                                                                                                  If the instance was saved using save_data=False, it's possible to load new data into it and apply all data transformations.

                                                                                                                                                                                  Info

                                                                                                                                                                                  The loaded instance's current branch is the same branch as it was when saved.

                                                                                                                                                                                  Parametersfilename: str or Path Filename or pathlib.Path of the pickle file.

                                                                                                                                                                                  data: tuple of indexables or None, default=None Original dataset as it was provided to the instance's constructor. Only use this parameter if the loaded file was saved using save_data=False. Allowed formats are:

                                                                                                                                                                                  • X
                                                                                                                                                                                  • X, y
                                                                                                                                                                                  • train, test
                                                                                                                                                                                  • train, test, holdout
                                                                                                                                                                                  • X_train, X_test, y_train, y_test
                                                                                                                                                                                  • X_train, X_test, X_holdout, y_train, y_test, y_holdout
                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test)
                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)

                                                                                                                                                                                  X, train, test: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str, dict, sequence or dataframe Target column corresponding to `X`.

                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                  Returnsatom Unpickled atom instance.

                                                                                                                                                                                  method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.

                                                                                                                                                                                  Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.

                                                                                                                                                                                  Parametersother: Runner Instance with which to merge. Should be of the same class as self.

                                                                                                                                                                                  suffix: str, default=\"2\" Branches and models with conflicting names are merged adding suffix to the end of their names.

                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                  method reset(hard=False)[source]Reset the instance to its initial state.

                                                                                                                                                                                  Deletes all branches and models. The dataset is also reset to its form after initialization.

                                                                                                                                                                                  Parametershard: bool, default=False If True, completely flushes the cache.

                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                  method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                  save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance.

                                                                                                                                                                                  method save_data(filename=\"auto\", rows=\"dataset\", **kwargs)[source]Save the data in the current branch to a .csv file.

                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"dataset\" Selection of rows to save.

                                                                                                                                                                                  **kwargs Additional keyword arguments for pandas' to_csv method.

                                                                                                                                                                                  method shrink(int2bool=False, int2uint=False, str2cat=False, dense2sparse=False, columns=None)[source]Convert the columns to the smallest possible matching dtype.

                                                                                                                                                                                  Examples are: float64 -> float32, int64 -> int8, etc... Sparse arrays also transform their non-fill value. Use this method for memory optimization before saving the dataset. Note that applying transformers to the data may alter the types again.

                                                                                                                                                                                  Parametersint2bool: bool, default=False Whether to convert int columns to bool type. Only if the values in the column are strictly in (0, 1) or (-1, 1).

                                                                                                                                                                                  int2uint: bool, default=False Whether to convert int to uint (unsigned integer). Only if the values in the column are strictly positive.

                                                                                                                                                                                  str2cat: bool, default=False Whether to convert string to category. Only if the number of categories is less than 30% of the column's length.

                                                                                                                                                                                  dense2sparse: bool, default=False Whether to convert all features to sparse format. The value that is compressed is the most frequent value in the column.

                                                                                                                                                                                  columns: int, str, segment, sequence, dataframe or None, default=None Selection of columns to shrink. If None, transform all columns.

                                                                                                                                                                                  method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                  name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: Stack.

                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the final_estimator parameter.

                                                                                                                                                                                  method stats()[source]Display basic information about the dataset.

                                                                                                                                                                                  method status()[source]Get an overview of the branches and models.

                                                                                                                                                                                  This method prints the same information as the __repr__ and also saves it to the logger.

                                                                                                                                                                                  method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                          Returns: dataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                  method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                          name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: Vote.

                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's voting instance.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomclassifier/#data-cleaning", "title": "Data cleaning", "text": "

                                                                                                                                                                                  The data cleaning methods can help you scale the data, handle missing values, categorical columns, outliers and unbalanced datasets. All attributes of the data cleaning classes are attached to atom after running. Read more in the user guide.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the eda method to examine the data and help you determine suitable parameters for the data cleaning methods.

                                                                                                                                                                                          balance: Balance the number of rows per class in the target column. clean: Apply standard data cleaning steps on the dataset. discretize: Bin continuous data into intervals. encode: Perform encoding of categorical features. impute: Handle missing values in the dataset. normalize: Transform the data to follow a Normal/Gaussian distribution. prune: Prune outliers from the training set. scale: Scale the data.

                                                                                                                                                                                  method balance(strategy=\"adasyn\", **kwargs)[source]Balance the number of rows per class in the target column.

                                                                                                                                                                                  When oversampling, the newly created samples have an increasing integer index for numerical indices, and an index of the form [estimator]_N for non-numerical indices, where N stands for the N-th sample in the data set.

                                                                                                                                                                                  See the Balancer class for a description of the parameters.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  • The balance method does not support multioutput tasks.
                                                                                                                                                                                  • This transformation is only applied to the training set in order to maintain the original distribution of target classes in the test set.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use atom's classes attribute for an overview of the target class distribution per data set.

                                                                                                                                                                                  method clean(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, **kwargs)[source]Apply standard data cleaning steps on the dataset.

                                                                                                                                                                                  Use the parameters to choose which transformations to perform. The available steps are:

                                                                                                                                                                                  • Convert dtypes to the best possible types.
                                                                                                                                                                                  • Drop columns with specific data types.
                                                                                                                                                                                  • Remove characters from column names.
                                                                                                                                                                                          • Strip spaces from categorical features.
                                                                                                                                                                                  • Drop duplicate rows.
                                                                                                                                                                                  • Drop rows with missing values in the target column.
                                                                                                                                                                                  • Encode the target column (ignored for regression tasks).

                                                                                                                                                                                  See the Cleaner class for a description of the parameters.

                                                                                                                                                                                  method discretize(strategy=\"quantile\", bins=5, labels=None, **kwargs)[source]Bin continuous data into intervals.

                                                                                                                                                                                  For each feature, the bin edges are computed during fit and, together with the number of bins, they will define the intervals. Ignores numerical columns.

                                                                                                                                                                                  See the Discretizer class for a description of the parameters.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the plot_distribution method to visualize a column's distribution and decide on the bins.

                                                                                                                                                                                  method encode(strategy=\"Target\", max_onehot=10, ordinal=None, infrequent_to_value=None, value=\"rare\", **kwargs)[source]Perform encoding of categorical features.

                                                                                                                                                                                  The encoding type depends on the number of classes in the column:

                                                                                                                                                                                  • If n_classes=2 or ordinal feature, use Ordinal-encoding.
                                                                                                                                                                                  • If 2 < n_classes <= max_onehot, use OneHot-encoding.
                                                                                                                                                                                  • If n_classes > max_onehot, use strategy-encoding.

                                                                                                                                                                                  Missing values are propagated to the output column. Unknown classes encountered during transforming are imputed according to the selected strategy. Rare classes can be replaced with a value in order to prevent too high cardinality.

                                                                                                                                                                                  See the Encoder class for a description of the parameters.

                                                                                                                                                                                  Note

                                                                                                                                                                                  This method only encodes the categorical features. It does not encode the target column! Use the clean method for that.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the categorical attribute for a list of the categorical features in the dataset.

                                                                                                                                                                                  method impute(strat_num=\"drop\", strat_cat=\"drop\", max_nan_rows=None, max_nan_cols=None, **kwargs)[source]Handle missing values in the dataset.

                                                                                                                                                                                  Impute or remove missing values according to the selected strategy. Also removes rows and columns with too many missing values. Use the missing attribute to customize what are considered \"missing values\".

                                                                                                                                                                                  See the Imputer class for a description of the parameters.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the nans attribute to check the amount of missing values per column.

                                                                                                                                                                                  method normalize(strategy=\"yeojohnson\", **kwargs)[source]Transform the data to follow a Normal/Gaussian distribution.

                                                                                                                                                                                  This transformation is useful for modeling issues related to heteroscedasticity (non-constant variance), or other situations where normality is desired. Missing values are disregarded in fit and maintained in transform. Ignores categorical columns.

                                                                                                                                                                                  See the Normalizer class for a description of the parameters.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the plot_distribution method to examine a column's distribution.

                                                                                                                                                                                  method prune(strategy=\"zscore\", method=\"drop\", max_sigma=3, include_target=False, **kwargs)[source]Prune outliers from the training set.

                                                                                                                                                                                  Replace or remove outliers. The definition of outlier depends on the selected strategy and can greatly differ from one another. Ignores categorical columns.

                                                                                                                                                                                  See the Pruner class for a description of the parameters.

                                                                                                                                                                                  Note

                                                                                                                                                                                  This transformation is only applied to the training set in order to maintain the original distribution of samples in the test set.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the outliers attribute to check the number of outliers per column.

                                                                                                                                                                                  method scale(strategy=\"standard\", include_binary=False, **kwargs)[source]Scale the data.

                                                                                                                                                                                  Apply one of sklearn's scalers. Categorical columns are ignored.

                                                                                                                                                                                  See the Scaler class for a description of the parameters.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the scaled attribute to check whether the dataset is scaled.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomclassifier/#nlp", "title": "NLP", "text": "

                                                                                                                                                                                  The Natural Language Processing (NLP) transformers help to convert raw text to meaningful numeric values, ready to be ingested by a model. All transformations are applied only on the column in the dataset called corpus. Read more in the user guide.

                                                                                                                                                                                  textcleanApply standard text cleaning to the corpus.textnormalizeNormalize the corpus.tokenizeTokenize the corpus.vectorizeVectorize the corpus.

                                                                                                                                                                                  method textclean(decode=True, lower_case=True, drop_email=True, regex_email=None, drop_url=True, regex_url=None, drop_html=True, regex_html=None, drop_emoji=True, regex_emoji=None, drop_number=True, regex_number=None, drop_punctuation=True, **kwargs)[source]Apply standard text cleaning to the corpus.

                                                                                                                                                                                          Transformations include normalizing characters and dropping noise from the text (emails, HTML tags, URLs, etc...). The transformations are applied on the column named corpus, in the same order the parameters are presented. If there is no column with that name, an exception is raised.

                                                                                                                                                                                  See the TextCleaner class for a description of the parameters.

                                                                                                                                                                                  method textnormalize(stopwords=True, custom_stopwords=None, stem=False, lemmatize=True, **kwargs)[source]Normalize the corpus.

                                                                                                                                                                                  Convert words to a more uniform standard. The transformations are applied on the column named corpus, in the same order the parameters are presented. If there is no column with that name, an exception is raised. If the provided documents are strings, words are separated by spaces.

                                                                                                                                                                                  See the TextNormalizer class for a description of the parameters.

                                                                                                                                                                                  method tokenize(bigram_freq=None, trigram_freq=None, quadgram_freq=None, **kwargs)[source]Tokenize the corpus.

                                                                                                                                                                                  Convert documents into sequences of words. Additionally, create n-grams (represented by words united with underscores, e.g., \"New_York\") based on their frequency in the corpus. The transformations are applied on the column named corpus. If there is no column with that name, an exception is raised.

                                                                                                                                                                                  See the Tokenizer class for a description of the parameters.

                                                                                                                                                                                  method vectorize(strategy=\"bow\", return_sparse=True, **kwargs)[source]Vectorize the corpus.

                                                                                                                                                                                  Transform the corpus into meaningful vectors of numbers. The transformation is applied on the column named corpus. If there is no column with that name, an exception is raised.

                                                                                                                                                                                  If strategy=\"bow\" or \"tfidf\", the transformed columns are named after the word they are embedding with the prefix corpus_. If strategy=\"hashing\", the columns are named hash[N], where N stands for the n-th hashed column.

                                                                                                                                                                                  See the Vectorizer class for a description of the parameters.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomclassifier/#feature-engineering", "title": "Feature engineering", "text": "

                                                                                                                                                                                  To further pre-process the data, it's possible to extract features from datetime columns, create new non-linear features transforming the existing ones, group similar features or, if the dataset is too large, remove features. Read more in the user guide.

                                                                                                                                                                                  feature_extractionExtract features from datetime columns.feature_generationGenerate new features.feature_groupingExtract statistics from similar features.feature_selectionReduce the number of features in the data.

                                                                                                                                                                                  method feature_extraction(features=('day', 'month', 'year'), fmt=None, encoding_type=\"ordinal\", drop_columns=True, **kwargs)[source]Extract features from datetime columns.

                                                                                                                                                                                  Create new features extracting datetime elements (day, month, year, etc...) from the provided columns. Columns of dtype datetime64 are used as is. Categorical columns that can be successfully converted to a datetime format (less than 30% NaT values after conversion) are also used.

                                                                                                                                                                                  See the FeatureExtractor class for a description of the parameters.

                                                                                                                                                                                  method feature_generation(strategy=\"dfs\", n_features=None, operators=None, **kwargs)[source]Generate new features.

                                                                                                                                                                                  Create new combinations of existing features to capture the non-linear relations between the original features.

                                                                                                                                                                                  See the FeatureGenerator class for a description of the parameters.

                                                                                                                                                                                  method feature_grouping(groups, operators=None, drop_columns=True, **kwargs)[source]Extract statistics from similar features.

                                                                                                                                                                                  Replace groups of features with related characteristics with new features that summarize statistical properties of the group. The statistical operators are calculated over every row of the group. The group names and features can be accessed through the groups method.

                                                                                                                                                                                  See the FeatureGrouper class for a description of the parameters.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use a regex pattern with the groups parameter to select groups easier, e.g., atom.feature_grouping({\"group1\": \"var_.+\") to select all features that start with var_.

                                                                                                                                                                                  method feature_selection(strategy=None, solver=None, n_features=None, min_repeated=2, max_repeated=1.0, max_correlation=1.0, **kwargs)[source]Reduce the number of features in the data.

                                                                                                                                                                                  Apply feature selection or dimensionality reduction, either to improve the estimators' accuracy or to boost their performance on very high-dimensional datasets. Additionally, remove multicollinear and low-variance features.

                                                                                                                                                                                  See the FeatureSelector class for a description of the parameters.

                                                                                                                                                                                  Note

                                                                                                                                                                                  • When strategy=\"univariate\" and solver=None, f_classif or f_regression is used as default solver.
                                                                                                                                                                                  • When strategy is \"sfs\", \"rfecv\" or any of the advanced strategies and no scoring is specified, atom's metric (if it exists) is used as scoring.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomclassifier/#training", "title": "Training", "text": "

                                                                                                                                                                                  The training methods are where the models are fitted to the data and their performance is evaluated against a selected metric. There are three methods to call the three different training approaches. Read more in the user guide.

                                                                                                                                                                                  runTrain and evaluate the models in a direct fashion.successive_halvingFit the models in a successive halving fashion.train_sizingTrain and evaluate the models in a train sizing fashion.

                                                                                                                                                                                  method run(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a direct fashion.

                                                                                                                                                                                  Contrary to successive_halving and train_sizing, the direct approach only iterates once over the models, using the full dataset.

                                                                                                                                                                                  The following steps are applied to every model:

                                                                                                                                                                                  1. Apply hyperparameter tuning (optional).
                                                                                                                                                                                  2. Fit the model on the training set using the best combination of hyperparameters found.
                                                                                                                                                                                  3. Evaluate the model on the test set.
                                                                                                                                                                                  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

                                                                                                                                                                                  See the DirectClassifier or DirectRegressor class for a description of the parameters.

                                                                                                                                                                                  method successive_halving(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Fit the models in a successive halving fashion.

                                                                                                                                                                                  The successive halving technique is a bandit-based algorithm that fits N models to 1/N of the data. The best half are selected to go to the next iteration where the process is repeated. This continues until only one model remains, which is fitted on the complete dataset. Beware that a model's performance can depend greatly on the amount of data on which it is trained. For this reason, it is recommended to only use this technique with similar models, e.g., only using tree-based models.

                                                                                                                                                                                  The following steps are applied to every model (per iteration):

                                                                                                                                                                                  1. Apply hyperparameter tuning (optional).
                                                                                                                                                                                  2. Fit the model on the training set using the best combination of hyperparameters found.
                                                                                                                                                                                  3. Evaluate the model on the test set.
                                                                                                                                                                                  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

                                                                                                                                                                                  See the SuccessiveHalvingClassifier or SuccessiveHalvingRegressor class for a description of the parameters.

                                                                                                                                                                                  method train_sizing(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a train sizing fashion.

                                                                                                                                                                                  When training models, there is usually a trade-off between model performance and computation time, which is regulated by the number of samples in the training set. This method can be used to gain insight into this trade-off and help determine the optimal size of the training set. The models are fitted multiple times, each time increasing the number of samples in the training set.

                                                                                                                                                                                  The following steps are applied to every model (per iteration):

                                                                                                                                                                                  1. Apply hyperparameter tuning (optional).
                                                                                                                                                                                  2. Fit the model on the training set using the best combination of hyperparameters found.
                                                                                                                                                                                  3. Evaluate the model on the test set.
                                                                                                                                                                                  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

                                                                                                                                                                                  See the TrainSizingClassifier or TrainSizingRegressor class for a description of the parameters.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomforecaster/", "title": "ATOMForecaster", "text": "

                                                                                                                                                                                  class atom.api.ATOMForecaster(*arrays, y=-1, n_rows=1, test_size=0.2, holdout_size=None, n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Main class for forecasting tasks.

                                                                                                                                                                                  Apply all data transformations and model management provided by the package on a given dataset. Note that, contrary to sklearn's API, the instance contains the dataset on which to perform the analysis. Calling a method will automatically apply it on the dataset it contains.

                                                                                                                                                                                  All data cleaning, feature engineering, model training and plotting functionality can be accessed from an instance of this class.

                                                                                                                                                                                  Parameters *arrays: sequence of indexables Dataset containing exogenous features and time series. Allowed formats are:

                                                                                                                                                                                  • X
                                                                                                                                                                                  • y
                                                                                                                                                                                  • X, y
                                                                                                                                                                                  • train, test
                                                                                                                                                                                  • train, test, holdout
                                                                                                                                                                                  • X_train, X_test, y_train, y_test
                                                                                                                                                                                  • X_train, X_test, X_holdout, y_train, y_test, y_holdout
                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test)
                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)

                                                                                                                                                                                  X, train, test: dataframe-like Exogenous feature set corresponding to y, with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str or sequence Time series.

                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                  y: int, str, dict, sequence or dataframe, default=-1 Time series.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                  This parameter is ignored if the time series is provided through arrays.

                                                                                                                                                                                  test_size: int or float, default=0.2

                                                                                                                                                                                  • If <=1: Fraction of the dataset to include in the test set.
                                                                                                                                                                                  • If >1: Number of rows to include in the test set.

                                                                                                                                                                                  This parameter is ignored if the test set is provided through arrays.

                                                                                                                                                                                  holdout_size: int, float or None, default=None

                                                                                                                                                                                  • If None: No holdout data set is kept apart.
                                                                                                                                                                                  • If <=1: Fraction of the dataset to include in the holdout set.
                                                                                                                                                                                  • If >1: Number of rows to include in the holdout set.

                                                                                                                                                                                  This parameter is ignored if the holdout set is provided through arrays.

                                                                                                                                                                                  n_rows: int or float, default=1 Subsample of the dataset to use. The cut is made from the head of the dataset (older entries are dropped when sorted by date ascending). The default value selects all rows.

                                                                                                                                                                                  • If <=1: Fraction of the dataset to select.
                                                                                                                                                                                  • If >1: Exact number of rows to select. Only if arrays is X or X, y.

                                                                                                                                                                                  n_jobs: int, default=1 Number of cores to use for parallel processing.

                                                                                                                                                                                  • If >0: Number of cores to use.
                                                                                                                                                                                  • If -1: Use all available cores.
                                                                                                                                                                                  • If <-1: Use number of cores + 1 + n_jobs (e.g., n_jobs=-2 uses all cores but one).

                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                    • \"sklearnex\"
                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                  backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

                                                                                                                                                                                  • \"loky\": Single-node, process-based parallelism.
                                                                                                                                                                                  • \"multiprocessing\": Legacy single-node, process-based parallelism. Less robust than loky.
                                                                                                                                                                                  • \"threading\": Single-node, thread-based parallelism.
                                                                                                                                                                                  • \"ray\": Multi-node, process-based parallelism.

                                                                                                                                                                                  memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide.

                                                                                                                                                                                  • If False: No caching is performed.
                                                                                                                                                                                  • If True: A default temp directory is used.
                                                                                                                                                                                  • If str: Path to the caching directory.
                                                                                                                                                                                  • If Path: A pathlib.Path to the caching directory.
                                                                                                                                                                                  • If Memory: Object with the joblib.Memory interface.

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                  warnings: bool or str, default=False

                                                                                                                                                                                  • If True: Default warning action (equal to \"once\").
                                                                                                                                                                                  • If False: Suppress all warnings (equal to \"ignore\").
                                                                                                                                                                                  • If str: One of python's warnings filters.

                                                                                                                                                                                  Changing this parameter affects the PYTHONWarnings environment. ATOM can't manage warnings that go from C/C++ code to stdout.

                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic name.
                                                                                                                                                                                  • If Path: A pathlib.Path to the log file.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed.

                                                                                                                                                                                  random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  ATOMClassifier Main class for classification tasks.

                                                                                                                                                                                  ATOMRegressor Main class for regression tasks.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomforecaster/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMForecaster\n>>> from sktime.datasets import load_airline\n\n>>> y = load_airline()\n\n>>> # Initialize atom\n>>> atom = ATOMForecaster(y, verbose=2)\n\n<< ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Univariate forecast.\n\nDataset stats ==================== >>\nShape: (144, 1)\nTrain set size: 116\n --> From: 1949-01  To: 1958-08\nTest set size: 28\n --> From: 1958-09  To: 1960-12\n-------------------------------------\nMemory: 6.47 kB\nDuplicates: 26 (18.1%)\n\n\n\n>>> # Train models\n>>> atom.run(models=[\"NF\", \"ES\", \"ETS\"])\n\n\nTraining ========================= >>\nModels: NF, ES, ETS\nMetric: mape\n\n\nResults for NaiveForecaster:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0858\nTest evaluation --> mape: -0.2305\nTime elapsed: 0.025s\n-------------------------------------------------\nTime: 0.025s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0864\nTest evaluation --> mape: -0.2303\nTime elapsed: 0.042s\n-------------------------------------------------\nTime: 0.042s\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0858\nTest evaluation --> mape: -0.2305\nTime elapsed: 0.021s\n-------------------------------------------------\nTime: 0.021s\n\n\nFinal results ==================== >>\nTotal time: 0.090s\n-------------------------------------\nNaiveForecaster      --> mape: -0.2305\nExponentialSmoothing --> mape: -0.2303 !\nETS                  --> mape: -0.2305\n\n\n>>> # Analyze the results\n>>> print(atom.results)\n\n     mape_train  mape_test  time_fit      time\nNF      -0.0858    -0.2305  0.025023  0.025023\nES      -0.0864    
-0.2303  0.042052  0.042052\nETS     -0.0858    -0.2305  0.021019  0.021019\n\n\n>>> print(atom.evaluate())\n\n         mae    mape         mse      r2      rmse\nNF  -91.8571 -0.2305 -10656.7143 -0.7278 -103.2314\nES  -91.8163 -0.2303 -10647.1506 -0.7263 -103.1850\nETS -91.8563 -0.2305 -10656.5266 -0.7278 -103.2305\n
                                                                                                                                                                                  "}, {"location": "API/ATOM/atomforecaster/#magic-methods", "title": "Magic methods", "text": "

                                                                                                                                                                                  The class contains some magic methods to help you access some of its elements faster. Note that methods that apply on the pipeline can return different results per branch.

                                                                                                                                                                                  • __repr__: Prints an overview of atom's branches, models and metric.
                                                                                                                                                                                  • __len__: Returns the length of the dataset.
                                                                                                                                                                                  • __iter__: Iterate over the pipeline's transformers.
                                                                                                                                                                                  • __contains__: Checks if the provided item is a column in the dataset.
                                                                                                                                                                                  • __getitem__: Access a branch, model, column or subset of the dataset.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomforecaster/#attributes", "title": "Attributes", "text": ""}, {"location": "API/ATOM/atomforecaster/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                  The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.

                                                                                                                                                                                  Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                  mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                  The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set.

                                                                                                                                                                                  This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). scaled: boolWhether the feature set is scaled.

                                                                                                                                                                                  A data set is considered scaled when it has mean=0 and std=1, or when there is a scaler in the pipeline. Binary columns (only zeros and ones) are excluded from the calculation. duplicates: int | numpy.integerNumber of duplicate rows in the dataset. missing: list[Any]Values that are considered \"missing\".

                                                                                                                                                                                  These values are used by the clean and impute methods. Default values are: None, NaN, NA, NaT, +inf, -inf, \"\", \"?\", \"NA\", \"nan\", \"NaN\", \"NaT\", \"none\", \"None\", \"inf\", \"-inf\". Note that None, NaN, NA, +inf and -inf are always considered missing since they are incompatible with sklearn estimators. nans: Series | modin.pandas.series.SeriesColumns with the number of missing values in them.

                                                                                                                                                                                  This property is unavailable for sparse datasets. n_nans: intNumber of rows containing missing values.

                                                                                                                                                                                  This property is unavailable for sparse datasets. numerical: IndexNames of the numerical features in the dataset. n_numerical: intNumber of numerical features in the dataset. categorical: IndexNames of the categorical features in the dataset. n_categorical: intNumber of categorical features in the dataset. outliers: SeriesColumns in training set with number of outlier values.

                                                                                                                                                                                  This property is unavailable for sparse datasets. n_outliers: int | numpy.integerNumber of samples in the training set containing outliers.

                                                                                                                                                                                  This property is unavailable for sparse datasets.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomforecaster/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                  The utility attributes are used to access information about the models in the instance after training.

                                                                                                                                                                                  Attributesbranch: BranchCurrent active branch.

                                                                                                                                                                                  Use the property's @setter to change the branch or to create a new one. If the value is the name of an existing branch, switch to that one. Else, create a new branch using that name. The new branch is split from the current branch. Use _from_ to split the new branch from any other existing branch. Read more in the user guide. models: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance.

                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. winner: model | NoneBest performing model.

                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. results: pd.DataFrameOverview of the training results.

                                                                                                                                                                                  All durations are in seconds. Possible values include:

                                                                                                                                                                                  • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                  • [metric]_train: Metric score on the train set.
                                                                                                                                                                                  • [metric]_test: Metric score on the test set.
                                                                                                                                                                                  • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                  • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                  • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                  • time: Total duration of the run.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomforecaster/#tracking-attributes", "title": "Tracking attributes", "text": "

                                                                                                                                                                                  The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.

                                                                                                                                                                                  Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomforecaster/#plot-attributes", "title": "Plot attributes", "text": "

                                                                                                                                                                                  The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

                                                                                                                                                                                  Attributespalette: str | Sequence[str]Color palette.

                                                                                                                                                                                  Specify one of plotly's built-in palettes or create a custom one, e.g., atom.palette = [\"red\", \"green\", \"blue\"]. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomforecaster/#utility-methods", "title": "Utility methods", "text": "

                                                                                                                                                                                  Next to the plotting methods, the class contains a variety of utility methods to handle the data and manage the pipeline.

                                                                                                                                                                                  addAdd a transformer to the pipeline.applyApply a function to the dataset.available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.distributionGet statistics on column distributions.edaCreate an Exploratory Data Analysis report.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_sample_weightReturn sample weights for a balanced data set.inverse_transformInversely transform new data through the pipeline.loadLoad an atom instance from a pickle file.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.resetReset the instance to its initial state.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_dataSave the data in the current branch to a .csv file.shrinkConvert the columns to the smallest possible matching dtype.stackingAdd a Stacking model to the pipeline.statsDisplay basic information about the dataset.statusGet an overview of the branches and models.transformTransform new data through the pipeline.votingAdd a Voting model to the pipeline.

                                                                                                                                                                                  method add(transformer, columns=None, train_only=False, **fit_params)[source]Add a transformer to the pipeline.

                                                                                                                                                                                  If the transformer is not fitted, it is fitted on the complete training set. Afterwards, the data set is transformed and the estimator is added to atom's pipeline. If the estimator is a sklearn Pipeline, every estimator is merged independently with atom.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  • The transformer should have fit and/or transform methods with arguments X (accepting a dataframe-like object of shape=(n_samples, n_features)) and/or y (accepting a sequence of shape=(n_samples,)).
                                                                                                                                                                                  • The transform method should return a feature set as a dataframe-like object of shape=(n_samples, n_features) and/or a target column as a sequence of shape=(n_samples,).

                                                                                                                                                                                  Note

                                                                                                                                                                                  If the transform method doesn't return a dataframe:

                                                                                                                                                                                  • The column naming happens as follows. If the transformer has a get_feature_names_out method, it is used. If not, and it returns the same number of columns, the names are kept equal. If the number of columns changes, old columns will keep their name (as long as the column is unchanged) and new columns will receive the name x[N-1], where N stands for the n-th feature. This means that a transformer should only transform, add or drop columns, not combinations of these.
                                                                                                                                                                                  • The index remains the same as before the transformation. This means that the transformer should not add, remove or shuffle rows unless it returns a dataframe.

                                                                                                                                                                                  Note

                                                                                                                                                                                  If the transformer has an n_jobs and/or random_state parameter that is left to its default value, it adopts atom's value.

                                                                                                                                                                                  Parameterstransformer: Transformer Estimator to add to the pipeline. Should implement a transform method.

                                                                                                                                                                                  columns: int, str, segment, sequence, dataframe or None, default=None Selection of columns to transform. Only select features or the target column, not both at the same time (if that happens, the target column is ignored). If None, transform all columns.

                                                                                                                                                                                  train_only: bool, default=False Whether to apply the estimator only on the training set or on the complete dataset. Note that if True, the transformation is skipped when making predictions on new data.

                                                                                                                                                                                  **fit_params Additional keyword arguments for the transformer's fit method.

                                                                                                                                                                                  method apply(func, inverse_func=None, kw_args=None, inv_kw_args=None, **kwargs)[source]Apply a function to the dataset.

                                                                                                                                                                                  This method is useful for stateless transformations such as taking the log, doing custom scaling, etc...

                                                                                                                                                                                  Note

                                                                                                                                                                                  This approach is preferred over changing the dataset directly through the property's @setter since the transformation is stored in the pipeline.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use atom.apply(lambda df: df.drop(\"column_name\",axis=1)) to store the removal of columns in the pipeline.

                                                                                                                                                                                  Parametersfunc: callable Function to apply with signature func(dataset, **kw_args) -> dataset.

                                                                                                                                                                                  inverse_func: callable or None, default=None Inverse function of func. If None, the inverse_transform method returns the input unchanged.

                                                                                                                                                                                  kw_args: dict or None, default=None Additional keyword arguments for the function.

                                                                                                                                                                                  inv_kw_args: dict or None, default=None Additional keyword arguments for the inverse function.

                                                                                                                                                                                  method available_models()[source]Give an overview of the available predefined models.

                                                                                                                                                                                  Returnspd.DataFrame Information about the available predefined models. Columns include:

                                                                                                                                                                                  • acronym: Model's acronym (used to call the model).
                                                                                                                                                                                  • model: Name of the model's class.
                                                                                                                                                                                  • estimator: The model's underlying estimator.
                                                                                                                                                                                  • module: The estimator's module.
                                                                                                                                                                                  • needs_scaling: Whether the model requires feature scaling.
                                                                                                                                                                                  • accepts_sparse: Whether the model accepts sparse matrices.
                                                                                                                                                                                  • native_multilabel: Whether the model has native support for multilabel tasks.
                                                                                                                                                                                  • native_multioutput: Whether the model has native support for multioutput tasks.
                                                                                                                                                                                  • has_validation: Whether the model has in-training validation.
                                                                                                                                                                                  • supports_engines: Engines supported by the model.

                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from all models.

                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                  • Shap values
                                                                                                                                                                                  • App instance
                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                  method delete(models=None)[source]Delete models.

                                                                                                                                                                                  If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted.

                                                                                                                                                                                  method distribution(distributions=None, columns=None)[source]Get statistics on column distributions.

                                                                                                                                                                                  Compute the Kolmogorov-Smirnov test for various distributions against columns in the dataset. Only for numerical columns. Missing values are ignored.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the plot_distribution method to plot a column's distribution.

                                                                                                                                                                                  Parametersdistributions: str, sequence or None, default=None Names of the distributions in scipy.stats to get the statistics on. If None, a selection of the most common ones is used.

                                                                                                                                                                                  columns: int, str, segment, sequence, dataframe or None, default=None Selection of columns to perform the test on. If None, select all numerical columns.

                                                                                                                                                                                  Returns pd.DataFrame Statistic results with multiindex levels:

                                                                                                                                                                                  • dist: Name of the distribution.
                                                                                                                                                                                  • stat: Statistic results:
                                                                                                                                                                                    • score: KS-test score.
                                                                                                                                                                                    • p_value: Corresponding p-value.

                                                                                                                                                                                  method eda(rows=\"dataset\", target=0, filename=None)[source]Create an Exploratory Data Analysis report.

                                                                                                                                                                                  ATOM uses the sweetviz package for EDA. The report is rendered directly in the notebook. It can also be accessed through the report attribute. It can either report one dataset or compare two datasets against each other.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  This method can be slow for large datasets.

                                                                                                                                                                                  Parameters rows: str, sequence or dict, default=\"dataset\" Selection of rows to include in the report.

                                                                                                                                                                                  • If str: Name of the data set to report.
                                                                                                                                                                                  • If sequence: Names of two data sets to compare.
                                                                                                                                                                                  • If dict: Names of up to two data sets with corresponding selection of rows to report.

                                                                                                                                                                                  target: int or str, default=0 Target column to look at. Only for multilabel tasks. Only bool and numerical features can be used as target.

                                                                                                                                                                                  filename: str, Path or None, default=None Filename or pathlib.Path of the (html) file to save. If None, don't save anything.

                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.

                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task.

                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.

                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                  Returnspd.DataFrame Scores of the models.

                                                                                                                                                                                  method export_pipeline(model=None)[source]Export the internal pipeline.

                                                                                                                                                                                  This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

                                                                                                                                                                                  Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported.

                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                  method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.

                                                                                                                                                                                  Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.

                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

                                                                                                                                                                                  Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks.

                                                                                                                                                                                  method get_sample_weight(rows=\"train\")[source]Return sample weights for a balanced data set.

                                                                                                                                                                                  The returned weights are inversely proportional to the class frequencies in the selected data set. For multioutput tasks, the weights of each column of y will be multiplied.

                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

                                                                                                                                                                                  Returnsseries Sequence of weights with shape=(n_samples,).

                                                                                                                                                                                  method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be used to transform only the target column.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                  Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                  function atom.atom.load(filename, data=None)[source]Load an atom instance from a pickle file.

                                                                                                                                                                                  If the instance was saved using save_data=False, it's possible to load new data into it and apply all data transformations.

                                                                                                                                                                                  Info

                                                                                                                                                                                  The loaded instance's current branch is the same branch as it was when saved.

                                                                                                                                                                                  Parametersfilename: str or Path Filename or pathlib.Path of the pickle file.

                                                                                                                                                                                  data: tuple of indexables or None, default=None Original dataset as it was provided to the instance's constructor. Only use this parameter if the loaded file was saved using save_data=False. Allowed formats are:

                                                                                                                                                                                  • X
                                                                                                                                                                                  • X, y
                                                                                                                                                                                  • train, test
                                                                                                                                                                                  • train, test, holdout
                                                                                                                                                                                  • X_train, X_test, y_train, y_test
                                                                                                                                                                                  • X_train, X_test, X_holdout, y_train, y_test, y_holdout
                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test)
                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)

                                                                                                                                                                                  X, train, test: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str, dict, sequence or dataframe Target column corresponding to `X`.

                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                  Returnsatom Unpickled atom instance.

                                                                                                                                                                                  method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.

                                                                                                                                                                                  Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.

                                                                                                                                                                                  Parametersother: Runner Instance with which to merge. Should be of the same class as self.

                                                                                                                                                                                  suffix: str, default=\"2\" Branches and models with conflicting names are merged adding suffix to the end of their names.

                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                  method reset(hard=False)[source] Reset the instance to its initial state.

                                                                                                                                                                                  Deletes all branches and models. The dataset is also reset to its form after initialization.

                                                                                                                                                                                  Parameters hard: bool, default=False If True, completely flushes the cache.

                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                  method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                  save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance.

                                                                                                                                                                                  method save_data(filename=\"auto\", rows=\"dataset\", **kwargs)[source]Save the data in the current branch to a .csv file.

                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"dataset\" Selection of rows to save.

                                                                                                                                                                                  **kwargs Additional keyword arguments for pandas' to_csv method.

                                                                                                                                                                                  method shrink(int2bool=False, int2uint=False, str2cat=False, dense2sparse=False, columns=None)[source]Convert the columns to the smallest possible matching dtype.

                                                                                                                                                                                  Examples are: float64 -> float32, int64 -> int8, etc... Sparse arrays also transform their non-fill value. Use this method for memory optimization before saving the dataset. Note that applying transformers to the data may alter the types again.

                                                                                                                                                                                  Parametersint2bool: bool, default=False Whether to convert int columns to bool type. Only if the values in the column are strictly in (0, 1) or (-1, 1).

                                                                                                                                                                                  int2uint: bool, default=False Whether to convert int to uint (unsigned integer). Only if the values in the column are strictly positive.

                                                                                                                                                                                  str2cat: bool, default=False Whether to convert string to category. Only if the number of categories is less than 30% of the column's length.

                                                                                                                                                                                  dense2sparse: bool, default=False Whether to convert all features to sparse format. The value that is compressed is the most frequent value in the column.

                                                                                                                                                                                  columns: int, str, segment, sequence, dataframe or None, default=None Selection of columns to shrink. If None, transform all columns.

                                                                                                                                                                                  method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                  name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: Stack.

                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the final_estimator parameter.

                                                                                                                                                                                  method stats()[source]Display basic information about the dataset.

                                                                                                                                                                                  method status()[source]Get an overview of the branches and models.

                                                                                                                                                                                  This method prints the same information as the __repr__ and also saves it to the logger.

                                                                                                                                                                                  method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                  method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                  name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: Vote.

                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's voting instance.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomforecaster/#data-cleaning", "title": "Data cleaning", "text": "

                                                                                                                                                                                  The data cleaning methods can help you scale the data, handle missing values, categorical columns and outliers. All attributes of the data cleaning classes are attached to atom after running. Read more in the user guide.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the eda method to examine the data and help you determine suitable parameters for the data cleaning methods.

                                                                                                                                                                                  cleanApply standard data cleaning steps on the dataset.discretizeBin continuous data into intervals.encodePerform encoding of categorical features.imputeHandle missing values in the dataset.normalizeTransform the data to follow a Normal/Gaussian distribution.prunePrune outliers from the training set.scaleScale the data.

                                                                                                                                                                                  method clean(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, **kwargs)[source]Apply standard data cleaning steps on the dataset.

                                                                                                                                                                                  Use the parameters to choose which transformations to perform. The available steps are:

                                                                                                                                                                                  • Convert dtypes to the best possible types.
                                                                                                                                                                                  • Drop columns with specific data types.
                                                                                                                                                                                  • Remove characters from column names.
                                                                                                                                                                                  • Strip categorical features from spaces.
                                                                                                                                                                                  • Drop duplicate rows.
                                                                                                                                                                                  • Drop rows with missing values in the target column.
                                                                                                                                                                                  • Encode the target column (ignored for regression tasks).

                                                                                                                                                                                  See the Cleaner class for a description of the parameters.

                                                                                                                                                                                  method discretize(strategy=\"quantile\", bins=5, labels=None, **kwargs)[source]Bin continuous data into intervals.

                                                                                                                                                                                  For each feature, the bin edges are computed during fit and, together with the number of bins, they will define the intervals. Ignores numerical columns.

                                                                                                                                                                                  See the Discretizer class for a description of the parameters.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the plot_distribution method to visualize a column's distribution and decide on the bins.

                                                                                                                                                                                  method encode(strategy=\"Target\", max_onehot=10, ordinal=None, infrequent_to_value=None, value=\"rare\", **kwargs)[source]Perform encoding of categorical features.

                                                                                                                                                                                  The encoding type depends on the number of classes in the column:

                                                                                                                                                                                  • If n_classes=2 or ordinal feature, use Ordinal-encoding.
                                                                                                                                                                                  • If 2 < n_classes <= max_onehot, use OneHot-encoding.
                                                                                                                                                                                  • If n_classes > max_onehot, use strategy-encoding.

                                                                                                                                                                                  Missing values are propagated to the output column. Unknown classes encountered during transforming are imputed according to the selected strategy. Rare classes can be replaced with a value in order to prevent too high cardinality.

                                                                                                                                                                                  See the Encoder class for a description of the parameters.

                                                                                                                                                                                  Note

                                                                                                                                                                                  This method only encodes the categorical features. It does not encode the target column! Use the clean method for that.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the categorical attribute for a list of the categorical features in the dataset.

                                                                                                                                                                                  method impute(strat_num=\"drop\", strat_cat=\"drop\", max_nan_rows=None, max_nan_cols=None, **kwargs)[source]Handle missing values in the dataset.

                                                                                                                                                                                  Impute or remove missing values according to the selected strategy. Also removes rows and columns with too many missing values. Use the missing attribute to customize what are considered \"missing values\".

                                                                                                                                                                                  See the Imputer class for a description of the parameters.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the nans attribute to check the amount of missing values per column.

                                                                                                                                                                                  method normalize(strategy=\"yeojohnson\", **kwargs)[source]Transform the data to follow a Normal/Gaussian distribution.

                                                                                                                                                                                  This transformation is useful for modeling issues related to heteroscedasticity (non-constant variance), or other situations where normality is desired. Missing values are disregarded in fit and maintained in transform. Ignores categorical columns.

                                                                                                                                                                                  See the Normalizer class for a description of the parameters.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the plot_distribution method to examine a column's distribution.

                                                                                                                                                                                  method prune(strategy=\"zscore\", method=\"drop\", max_sigma=3, include_target=False, **kwargs)[source]Prune outliers from the training set.

                                                                                                                                                                                  Replace or remove outliers. The definition of outlier depends on the selected strategy and can greatly differ from one another. Ignores categorical columns.

                                                                                                                                                                                  See the Pruner class for a description of the parameters.

                                                                                                                                                                                  Note

                                                                                                                                                                                  This transformation is only applied to the training set in order to maintain the original distribution of samples in the test set.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the outliers attribute to check the number of outliers per column.

                                                                                                                                                                                  method scale(strategy=\"standard\", include_binary=False, **kwargs)[source]Scale the data.

                                                                                                                                                                                  Apply one of sklearn's scalers. Categorical columns are ignored.

                                                                                                                                                                                  See the Scaler class for a description of the parameters.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the scaled attribute to check whether the dataset is scaled.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomforecaster/#nlp", "title": "NLP", "text": "

                                                                                                                                                                                  The Natural Language Processing (NLP) transformers help to convert raw text to meaningful numeric values, ready to be ingested by a model. All transformations are applied only on the column in the dataset called corpus. Read more in the user guide.

                                                                                                                                                                                  textcleanApply standard text cleaning to the corpus.textnormalizeNormalize the corpus.tokenizeTokenize the corpus.vectorizeVectorize the corpus.

                                                                                                                                                                                  method textclean(decode=True, lower_case=True, drop_email=True, regex_email=None, drop_url=True, regex_url=None, drop_html=True, regex_html=None, drop_emoji=True, regex_emoji=None, drop_number=True, regex_number=None, drop_punctuation=True, **kwargs)[source]Apply standard text cleaning to the corpus.

                                                                                                                                                                                  Transformations include normalizing characters and dropping noise from the text (emails, HTML tags, URLs, etc...). The transformations are applied on the column named corpus, in the same order the parameters are presented. If there is no column with that name, an exception is raised.

                                                                                                                                                                                  See the TextCleaner class for a description of the parameters.

                                                                                                                                                                                  method textnormalize(stopwords=True, custom_stopwords=None, stem=False, lemmatize=True, **kwargs)[source]Normalize the corpus.

                                                                                                                                                                                  Convert words to a more uniform standard. The transformations are applied on the column named corpus, in the same order the parameters are presented. If there is no column with that name, an exception is raised. If the provided documents are strings, words are separated by spaces.

                                                                                                                                                                                  See the TextNormalizer class for a description of the parameters.

                                                                                                                                                                                  method tokenize(bigram_freq=None, trigram_freq=None, quadgram_freq=None, **kwargs)[source]Tokenize the corpus.

                                                                                                                                                                                  Convert documents into sequences of words. Additionally, create n-grams (represented by words united with underscores, e.g., \"New_York\") based on their frequency in the corpus. The transformations are applied on the column named corpus. If there is no column with that name, an exception is raised.

                                                                                                                                                                                  See the Tokenizer class for a description of the parameters.

                                                                                                                                                                                  method vectorize(strategy=\"bow\", return_sparse=True, **kwargs)[source]Vectorize the corpus.

                                                                                                                                                                                  Transform the corpus into meaningful vectors of numbers. The transformation is applied on the column named corpus. If there is no column with that name, an exception is raised.

                                                                                                                                                                                  If strategy=\"bow\" or \"tfidf\", the transformed columns are named after the word they are embedding with the prefix corpus_. If strategy=\"hashing\", the columns are named hash[N], where N stands for the n-th hashed column.

                                                                                                                                                                                  See the Vectorizer class for a description of the parameters.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomforecaster/#feature-engineering", "title": "Feature engineering", "text": "

                                                                                                                                                                                  To further pre-process the data, it's possible to extract features from datetime columns, create new non-linear features transforming the existing ones, group similar features or, if the dataset is too large, remove features. Read more in the user guide.

                                                                                                                                                                                  feature_extractionExtract features from datetime columns.feature_generationGenerate new features.feature_groupingExtract statistics from similar features.feature_selectionReduce the number of features in the data.

                                                                                                                                                                                  method feature_extraction(features=('day', 'month', 'year'), fmt=None, encoding_type=\"ordinal\", drop_columns=True, **kwargs)[source]Extract features from datetime columns.

                                                                                                                                                                                  Create new features extracting datetime elements (day, month, year, etc...) from the provided columns. Columns of dtype datetime64 are used as is. Categorical columns that can be successfully converted to a datetime format (less than 30% NaT values after conversion) are also used.

                                                                                                                                                                                  See the FeatureExtractor class for a description of the parameters.

                                                                                                                                                                                  method feature_generation(strategy=\"dfs\", n_features=None, operators=None, **kwargs)[source]Generate new features.

                                                                                                                                                                                  Create new combinations of existing features to capture the non-linear relations between the original features.

                                                                                                                                                                                  See the FeatureGenerator class for a description of the parameters.

                                                                                                                                                                                  method feature_grouping(groups, operators=None, drop_columns=True, **kwargs)[source]Extract statistics from similar features.

                                                                                                                                                                                  Replace groups of features with related characteristics with new features that summarize statistical properties of the group. The statistical operators are calculated over every row of the group. The group names and features can be accessed through the groups method.

                                                                                                                                                                                  See the FeatureGrouper class for a description of the parameters.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use a regex pattern with the groups parameter to select groups easier, e.g., atom.feature_grouping({\"group1\": \"var_.+\") to select all features that start with var_.

                                                                                                                                                                                  method feature_selection(strategy=None, solver=None, n_features=None, min_repeated=2, max_repeated=1.0, max_correlation=1.0, **kwargs)[source]Reduce the number of features in the data.

                                                                                                                                                                                  Apply feature selection or dimensionality reduction, either to improve the estimators' accuracy or to boost their performance on very high-dimensional datasets. Additionally, remove multicollinear and low-variance features.

                                                                                                                                                                                  See the FeatureSelector class for a description of the parameters.

                                                                                                                                                                                  Note

                                                                                                                                                                                  • When strategy=\"univariate\" and solver=None, f_classif or f_regression is used as default solver.
                                                                                                                                                                                  • When strategy is \"sfs\", \"rfecv\" or any of the advanced strategies and no scoring is specified, atom's metric (if it exists) is used as scoring.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomforecaster/#training", "title": "Training", "text": "

                                                                                                                                                                                  The training methods are where the models are fitted to the data and their performance is evaluated against a selected metric. There are three methods to call the three different training approaches. Read more in the user guide.

                                                                                                                                                                                  runTrain and evaluate the models in a direct fashion.successive_halvingFit the models in a successive halving fashion.train_sizingTrain and evaluate the models in a train sizing fashion.

                                                                                                                                                                                  method run(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a direct fashion.

                                                                                                                                                                                  Contrary to successive_halving and train_sizing, the direct approach only iterates once over the models, using the full dataset.

                                                                                                                                                                                  The following steps are applied to every model:

                                                                                                                                                                                  1. Apply hyperparameter tuning (optional).
                                                                                                                                                                                  2. Fit the model on the training set using the best combination of hyperparameters found.
                                                                                                                                                                                  3. Evaluate the model on the test set.
                                                                                                                                                                                  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

                                                                                                                                                                                  See the DirectClassifier or DirectRegressor class for a description of the parameters.

                                                                                                                                                                                  method successive_halving(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Fit the models in a successive halving fashion.

                                                                                                                                                                                  The successive halving technique is a bandit-based algorithm that fits N models to 1/N of the data. The best half are selected to go to the next iteration where the process is repeated. This continues until only one model remains, which is fitted on the complete dataset. Beware that a model's performance can depend greatly on the amount of data on which it is trained. For this reason, it is recommended to only use this technique with similar models, e.g., only using tree-based models.

                                                                                                                                                                                  The following steps are applied to every model (per iteration):

                                                                                                                                                                                  1. Apply hyperparameter tuning (optional).
                                                                                                                                                                                  2. Fit the model on the training set using the best combination of hyperparameters found.
                                                                                                                                                                                  3. Evaluate the model on the test set.
                                                                                                                                                                                  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

                                                                                                                                                                                  See the SuccessiveHalvingClassifier or SuccessiveHalvingRegressor class for a description of the parameters.

                                                                                                                                                                                  method train_sizing(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a train sizing fashion.

                                                                                                                                                                                  When training models, there is usually a trade-off between model performance and computation time, which is regulated by the number of samples in the training set. This method can be used to gain insight into this trade-off and help determine the optimal size of the training set. The models are fitted multiple times, each time increasing the number of samples in the training set.

                                                                                                                                                                                  The following steps are applied to every model (per iteration):

                                                                                                                                                                                  1. Apply hyperparameter tuning (optional).
                                                                                                                                                                                  2. Fit the model on the training set using the best combination of hyperparameters found.
                                                                                                                                                                                  3. Evaluate the model on the test set.
                                                                                                                                                                                  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

                                                                                                                                                                                  See the TrainSizingClassifier or TrainSizingRegressor class for a description of the parameters.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atommodel/", "title": "ATOMModel", "text": "

                                                                                                                                                                                  function atom.api.ATOMModel(estimator, name=None, acronym=None, needs_scaling=False, native_multilabel=False, native_multioutput=False, has_validation=None)[source]Convert an estimator to a model that can be ingested by atom.

                                                                                                                                                                                  This function adds the relevant attributes to the estimator so that they can be used by atom. Note that only estimators that follow sklearn's API are compatible.

                                                                                                                                                                                  Read more about custom models in the user guide.

                                                                                                                                                                                  Parametersestimator: Predictor Custom estimator. Should implement a fit and predict method.

                                                                                                                                                                                  name: str or None, default=None Name for the model. This is the value used to call the model from atom. The value should start with the model's acronym when specified. If None, the capital letters of the estimator's name are used (only if two or more, else it uses the entire name).

                                                                                                                                                                                  acronym: str or None, default=None Model's acronym. If None, it uses the model's name. Specify this parameter when you want to train multiple custom models that share the same estimator.

                                                                                                                                                                                  needs_scaling: bool, default=False Whether the model should use automated feature scaling.

                                                                                                                                                                                  native_multilabel: bool, default=False Whether the model has native support for multilabel tasks. If False and the task is multilabel, a multilabel meta-estimator is wrapper around the estimator.

                                                                                                                                                                                  native_multioutput: bool, default=False Whether the model has native support for multioutput tasks. If False and the task is multioutput, a multioutput meta-estimator is wrapped around the estimator.

                                                                                                                                                                                  has_validation: str or None, default=None Whether the model allows in-training validation.

                                                                                                                                                                                  • If None: No support for in-training validation.
                                                                                                                                                                                  • If str: Name of the estimator's parameter that states the number of iterations, e.g., n_estimators for RandomForestClassifier.

                                                                                                                                                                                  ReturnsPredictor Estimator with provided information. Provide this instance to the models parameter of the run method.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atommodel/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMRegressor, ATOMModel\n>>> from sklearn.datasets import load_diabetes\n>>> from sklearn.linear_model import RANSACRegressor\n\n>>> ransac = ATOMModel(\n...     estimator=RANSACRegressor(),\n...     name=\"RANSAC\",\n...     needs_scaling=False,\n... )\n\n>>> X, y = load_diabetes(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMRegressor(X, y, verbose=2)\n\n<< ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Regression.\n\nDataset stats ==================== >>\nShape: (442, 11)\nTrain set size: 354\nTest set size: 88\n-------------------------------------\nMemory: 39.03 kB\nScaled: False\nOutlier values: 12 (0.3%)\n\n\n>>> atom.run(ransac)\n\n\nTraining ========================= >>\nModels: RANSAC\nMetric: r2\n\n\nResults for RANSACRegressor:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.2946\nTest evaluation --> r2: 0.3787\nTime elapsed: 0.059s\n-------------------------------------------------\nTime: 0.059s\n\n\nFinal results ==================== >>\nTotal time: 0.060s\n-------------------------------------\nRANSACRegressor --> r2: 0.3787\n
                                                                                                                                                                                  "}, {"location": "API/ATOM/atomregressor/", "title": "ATOMRegressor", "text": "

                                                                                                                                                                                  class atom.api.ATOMRegressor(*arrays, y=-1, index=False, shuffle=True, n_rows=1, test_size=0.2, holdout_size=None, n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Main class for regression tasks.

                                                                                                                                                                                  Apply all data transformations and model management provided by the package on a given dataset. Note that, contrary to sklearn's API, the instance contains the dataset on which to perform the analysis. Calling a method will automatically apply it on the dataset it contains.

                                                                                                                                                                                  All data cleaning, feature engineering, model training and plotting functionality can be accessed from an instance of this class.

                                                                                                                                                                                  Parameters*arrays: sequence of indexables Dataset containing features and target. Allowed formats are:

                                                                                                                                                                                  • X
                                                                                                                                                                                  • X, y
                                                                                                                                                                                  • train, test
                                                                                                                                                                                  • train, test, holdout
                                                                                                                                                                                  • X_train, X_test, y_train, y_test
                                                                                                                                                                                  • X_train, X_test, X_holdout, y_train, y_test, y_holdout
                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test)
                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)

                                                                                                                                                                                  X, train, test: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str or sequence Target column corresponding to `X`.

                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                  y: int, str, dict, sequence or dataframe, default=-1 Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                  This parameter is ignored if the target column is provided through arrays.

                                                                                                                                                                                  index: bool, int, str or sequence, default=False Handle the index in the resulting dataframe.

                                                                                                                                                                                  • If False: Reset to RangeIndex.
                                                                                                                                                                                  • If True: Use the provided index.
                                                                                                                                                                                  • If int: Position of the column to use as index.
                                                                                                                                                                                  • If str: Name of the column to use as index.
                                                                                                                                                                                  • If sequence: Array with shape=(n_samples,) to use as index.

                                                                                                                                                                                  test_size: int or float, default=0.2

                                                                                                                                                                                  • If <=1: Fraction of the dataset to include in the test set.
                                                                                                                                                                                  • If >1: Number of rows to include in the test set.

                                                                                                                                                                                  This parameter is ignored if the test set is provided through arrays.

                                                                                                                                                                                  holdout_size: int, float or None, default=None

                                                                                                                                                                                  • If None: No holdout data set is kept apart.
                                                                                                                                                                                  • If <=1: Fraction of the dataset to include in the holdout set.
                                                                                                                                                                                  • If >1: Number of rows to include in the holdout set.

                                                                                                                                                                                  This parameter is ignored if the holdout set is provided through arrays.

                                                                                                                                                                                  shuffle: bool, default=True Whether to shuffle the dataset before splitting the train and test set. Be aware that not shuffling the dataset can cause an unequal distribution of target classes over the sets.

                                                                                                                                                                                  n_rows: int or float, default=1 Random subsample of the dataset to use. The default value selects all rows.

                                                                                                                                                                                  • If <=1: Fraction of the dataset to select.
                                                                                                                                                                                  • If >1: Exact number of rows to select. Only if arrays is X or X, y.

                                                                                                                                                                                  n_jobs: int, default=1 Number of cores to use for parallel processing.

                                                                                                                                                                                  • If >0: Number of cores to use.
                                                                                                                                                                                  • If -1: Use all available cores.
                                                                                                                                                                                  • If <-1: Use number of cores + 1 + n_jobs, e.g., n_jobs=-2 uses all cores but one.

                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                    • \"sklearnex\"
                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                  backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

                                                                                                                                                                                  • \"loky\": Single-node, process-based parallelism.
                                                                                                                                                                                  • \"multiprocessing\": Legacy single-node, process-based parallelism. Less robust than loky.
                                                                                                                                                                                  • \"threading\": Single-node, thread-based parallelism.
                                                                                                                                                                                  • \"ray\": Multi-node, process-based parallelism.

                                                                                                                                                                                  memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide.

                                                                                                                                                                                  • If False: No caching is performed.
                                                                                                                                                                                  • If True: A default temp directory is used.
                                                                                                                                                                                  • If str: Path to the caching directory.
                                                                                                                                                                                  • If Path: A pathlib.Path to the caching directory.
                                                                                                                                                                                  • If Memory: Object with the joblib.Memory interface.

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                  warnings: bool or str, default=False

                                                                                                                                                                                  • If True: Default warning action (equal to \"once\").
                                                                                                                                                                                  • If False: Suppress all warnings (equal to \"ignore\").
                                                                                                                                                                                  • If str: One of python's warnings filters.

                                                                                                                                                                                  Changing this parameter affects the PYTHONWarnings environment. ATOM can't manage warnings that go from C/C++ code to stdout.

                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic name.
                                                                                                                                                                                  • If Path: A pathlib.Path to the log file.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed.

                                                                                                                                                                                  random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  ATOMClassifier Main class for classification tasks.

                                                                                                                                                                                  ATOMForecaster Main class for forecasting tasks.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomregressor/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMRegressor\n>>> from sklearn.datasets import load_diabetes\n\n>>> X, y = load_diabetes(return_X_y=True, as_frame=True)\n\n>>> # Initialize atom\n>>> atom = ATOMRegressor(X, y, verbose=2)\n\n<< ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Regression.\n\nDataset stats ==================== >>\nShape: (442, 11)\nTrain set size: 354\nTest set size: 88\n-------------------------------------\nMemory: 39.03 kB\nScaled: False\nOutlier values: 11 (0.3%)\n\n\n\n>>> # Apply data cleaning and feature engineering methods\n>>> atom.scale()\n\nFitting Scaler...\nScaling features...\n\n>>> atom.feature_selection(strategy=\"rfecv\", solver=\"xgb\", n_features=12)\n\nFitting FeatureSelector...\nPerforming feature selection ...\n --> rfecv selected 10 features from the dataset.\n\n\n>>> # Train models\n>>> atom.run(models=[\"OLS\", \"RF\", \"XGB\"])\n\n\nTraining ========================= >>\nModels: OLS, RF, XGB\nMetric: r2\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.5313\nTest evaluation --> r2: 0.4452\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.9203\nTest evaluation --> r2: 0.3471\nTime elapsed: 0.434s\n-------------------------------------------------\nTime: 0.434s\n\n\nResults for XGBoost:\nFit ---------------------------------------------\nTrain evaluation --> r2: 1.0\nTest evaluation --> r2: 0.2881\nTime elapsed: 0.187s\n-------------------------------------------------\nTime: 0.187s\n\n\nFinal results ==================== >>\nTotal time: 
0.645s\n-------------------------------------\nOrdinaryLeastSquares --> r2: 0.4452 !\nRandomForest         --> r2: 0.3471 ~\nXGBoost              --> r2: 0.2881 ~\n\n\n>>> # Analyze the results\n>>> print(atom.results)\n\n     r2_train  r2_test  time_fit      time\nOLS    0.5313   0.4452  0.020018  0.020018\nRF     0.9203   0.3471  0.434395  0.434395\nXGB    1.0000   0.2881  0.187170  0.187170\n\n\n>>> print(atom.evaluate())\n\n         mae    mape        mse      r2     rmse\nOLS -45.1949 -0.4267 -3172.9439  0.4452 -56.3289\nRF  -49.8684 -0.4612 -3733.6766  0.3471 -61.1038\nXGB -52.0370 -0.4708 -4071.0416  0.2881 -63.8047\n
                                                                                                                                                                                  "}, {"location": "API/ATOM/atomregressor/#magic-methods", "title": "Magic methods", "text": "

                                                                                                                                                                                  The class contains some magic methods to help you access some of its elements faster. Note that methods that apply on the pipeline can return different results per branch.

                                                                                                                                                                                  • __repr__: Prints an overview of atom's branches, models and metric.
                                                                                                                                                                                  • __len__: Returns the length of the dataset.
                                                                                                                                                                                  • __iter__: Iterate over the pipeline's transformers.
                                                                                                                                                                                  • __contains__: Checks if the provided item is a column in the dataset.
                                                                                                                                                                                  • __getitem__: Access a branch, model, column or subset of the dataset.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomregressor/#attributes", "title": "Attributes", "text": ""}, {"location": "API/ATOM/atomregressor/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                  The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.

                                                                                                                                                                                  Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                  mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                  The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set.

                                                                                                                                                                                  This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). scaled: boolWhether the feature set is scaled.

                                                                                                                                                                                  A data set is considered scaled when it has mean=0 and std=1, or when there is a scaler in the pipeline. Binary columns (only zeros and ones) are excluded from the calculation. duplicates: int | numpy.integerNumber of duplicate rows in the dataset. missing: list[Any]Values that are considered \"missing\".

                                                                                                                                                                                  These values are used by the clean and impute methods. Default values are: None, NaN, NA, NaT, +inf, -inf, \"\", \"?\", \"NA\", \"nan\", \"NaN\", \"NaT\", \"none\", \"None\", \"inf\", \"-inf\". Note that None, NaN, NA, +inf and -inf are always considered missing since they are incompatible with sklearn estimators. nans: Series | modin.pandas.series.SeriesColumns with the number of missing values in them.

                                                                                                                                                                                  This property is unavailable for sparse datasets. n_nans: intNumber of rows containing missing values.

                                                                                                                                                                                  This property is unavailable for sparse datasets. numerical: IndexNames of the numerical features in the dataset. n_numerical: intNumber of numerical features in the dataset. categorical: IndexNames of the categorical features in the dataset. n_categorical: intNumber of categorical features in the dataset. outliers: SeriesColumns in training set with number of outlier values.

                                                                                                                                                                                  This property is unavailable for sparse datasets. n_outliers: int | numpy.integerNumber of samples in the training set containing outliers.

                                                                                                                                                                                  This property is unavailable for sparse datasets.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomregressor/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                  The utility attributes are used to access information about the models in the instance after training.

                                                                                                                                                                                  Attributesbranch: BranchCurrent active branch.

                                                                                                                                                                                  Use the property's @setter to change the branch or to create a new one. If the value is the name of an existing branch, switch to that one. Else, create a new branch using that name. The new branch is split from the current branch. Use _from_ to split the new branch from any other existing branch. Read more in the user guide. models: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance.

                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. winner: model | NoneBest performing model.

                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. results: pd.DataFrameOverview of the training results.

                                                                                                                                                                                  All durations are in seconds. Possible values include:

                                                                                                                                                                                  • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                  • [metric]_train: Metric score on the train set.
                                                                                                                                                                                  • [metric]_test: Metric score on the test set.
                                                                                                                                                                                  • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                  • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                  • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                  • time: Total duration of the run.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomregressor/#tracking-attributes", "title": "Tracking attributes", "text": "

                                                                                                                                                                                  The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.

                                                                                                                                                                                  Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomregressor/#plot-attributes", "title": "Plot attributes", "text": "

                                                                                                                                                                                  The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

                                                                                                                                                                                  Attributespalette: str | Sequence[str]Color palette.

                                                                                                                                                                                  Specify one of plotly's built-in palettes or create a custom one, e.g., atom.palette = [\"red\", \"green\", \"blue\"]. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomregressor/#utility-methods", "title": "Utility methods", "text": "

                                                                                                                                                                                  Next to the plotting methods, the class contains a variety of utility methods to handle the data and manage the pipeline.

                                                                                                                                                                                  addAdd a transformer to the pipeline.applyApply a function to the dataset.available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.distributionGet statistics on column distributions.edaCreate an Exploratory Data Analysis report.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_sample_weightReturn sample weights for a balanced data set.inverse_transformInversely transform new data through the pipeline.loadLoad an atom instance from a pickle file.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.resetReset the instance to its initial state.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_dataSave the data in the current branch to a .csv file.shrinkConvert the columns to the smallest possible matching dtype.stackingAdd a Stacking model to the pipeline.statsDisplay basic information about the dataset.statusGet an overview of the branches and models.transformTransform new data through the pipeline.votingAdd a Voting model to the pipeline.

                                                                                                                                                                                  method add(transformer, columns=None, train_only=False, **fit_params)[source]Add a transformer to the pipeline.

                                                                                                                                                                                  If the transformer is not fitted, it is fitted on the complete training set. Afterwards, the data set is transformed and the estimator is added to atom's pipeline. If the estimator is a sklearn Pipeline, every estimator is merged independently with atom.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  • The transformer should have fit and/or transform methods with arguments X (accepting a dataframe-like object of shape=(n_samples, n_features)) and/or y (accepting a sequence of shape=(n_samples,)).
                                                                                                                                                                                  • The transform method should return a feature set as a dataframe-like object of shape=(n_samples, n_features) and/or a target column as a sequence of shape=(n_samples,).

                                                                                                                                                                                  Note

                                                                                                                                                                                  If the transform method doesn't return a dataframe:

                                                                                                                                                                                  • The column naming happens as follows. If the transformer has a get_feature_names_out method, it is used. If not, and it returns the same number of columns, the names are kept equal. If the number of columns changes, old columns will keep their name (as long as the column is unchanged) and new columns will receive the name x[N-1], where N stands for the n-th feature. This means that a transformer should only transform, add or drop columns, not combinations of these.
                                                                                                                                                                                  • The index remains the same as before the transformation. This means that the transformer should not add, remove or shuffle rows unless it returns a dataframe.

                                                                                                                                                                                  Note

                                                                                                                                                                                  If the transformer has an n_jobs and/or random_state parameter that is left to its default value, it adopts atom's value.

                                                                                                                                                                                  Parameterstransformer: Transformer Estimator to add to the pipeline. Should implement a transform method.

                                                                                                                                                                                  columns: int, str, segment, sequence, dataframe or None, default=None Selection of columns to transform. Only select features or the target column, not both at the same time (if that happens, the target column is ignored). If None, transform all columns.

                                                                                                                                                                                  train_only: bool, default=False Whether to apply the estimator only on the training set or on the complete dataset. Note that if True, the transformation is skipped when making predictions on new data.

                                                                                                                                                                                  **fit_params Additional keyword arguments for the transformer's fit method.

                                                                                                                                                                                  method apply(func, inverse_func=None, kw_args=None, inv_kw_args=None, **kwargs)[source]Apply a function to the dataset.

                                                                                                                                                                                  This method is useful for stateless transformations such as taking the log, doing custom scaling, etc...

                                                                                                                                                                                  Note

                                                                                                                                                                                  This approach is preferred over changing the dataset directly through the property's @setter since the transformation is stored in the pipeline.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use atom.apply(lambda df: df.drop(\"column_name\",axis=1)) to store the removal of columns in the pipeline.

                                                                                                                                                                                  Parametersfunc: callable Function to apply with signature func(dataset, **kw_args) -> dataset.

                                                                                                                                                                                  inverse_func: callable or None, default=None Inverse function of func. If None, the inverse_transform method returns the input unchanged.

                                                                                                                                                                                  kw_args: dict or None, default=None Additional keyword arguments for the function.

                                                                                                                                                                                  inv_kw_args: dict or None, default=None Additional keyword arguments for the inverse function.

                                                                                                                                                                                  method available_models()[source]Give an overview of the available predefined models.

                                                                                                                                                                                  Returnspd.DataFrame Information about the available predefined models. Columns include:

                                                                                                                                                                                  • acronym: Model's acronym (used to call the model).
                                                                                                                                                                                  • model: Name of the model's class.
                                                                                                                                                                                  • estimator: The model's underlying estimator.
                                                                                                                                                                                  • module: The estimator's module.
                                                                                                                                                                                  • needs_scaling: Whether the model requires feature scaling.
                                                                                                                                                                                  • accepts_sparse: Whether the model accepts sparse matrices.
                                                                                                                                                                                  • native_multilabel: Whether the model has native support for multilabel tasks.
                                                                                                                                                                                  • native_multioutput: Whether the model has native support for multioutput tasks.
                                                                                                                                                                                  • has_validation: Whether the model has in-training validation.
                                                                                                                                                                                  • supports_engines: Engines supported by the model.

                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from all models.

                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                  • Shap values
                                                                                                                                                                                  • App instance
                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                  method delete(models=None)[source]Delete models.

                                                                                                                                                                                  If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.

                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted.

                                                                                                                                                                                  method distribution(distributions=None, columns=None)[source]Get statistics on column distributions.

                                                                                                                                                                                  Compute the Kolmogorov-Smirnov test for various distributions against columns in the dataset. Only for numerical columns. Missing values are ignored.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the plot_distribution method to plot a column's distribution.

                                                                                                                                                                                  Parametersdistributions: str, sequence or None, default=None Names of the distributions in scipy.stats to get the statistics on. If None, a selection of the most common ones is used.

                                                                                                                                                                                  columns: int, str, segment, sequence, dataframe or None, default=None Selection of columns to perform the test on. If None, select all numerical columns.

                                                                                                                                                                                  Returnspd.DataFrame Statistic results with multiindex levels:

                                                                                                                                                                                  • dist: Name of the distribution.
                                                                                                                                                                                  • stat: Statistic results:
                                                                                                                                                                                    • score: KS-test score.
                                                                                                                                                                                    • p_value: Corresponding p-value.

                                                                                                                                                                                  method eda(rows=\"dataset\", target=0, filename=None)[source]Create an Exploratory Data Analysis report.

                                                                                                                                                                                  ATOM uses the sweetviz package for EDA. The report is rendered directly in the notebook. It can also be accessed through the report attribute. It can either report one dataset or compare two datasets against each other.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  This method can be slow for large datasets.

                                                                                                                                                                                  Parametersrows: str, sequence or dict, default=\"dataset\" Selection of rows to include in the report.

                                                                                                                                                                                  • If str: Name of the data set to report.
                                                                                                                                                                                  • If sequence: Names of two data sets to compare.
                                                                                                                                                                                  • If dict: Names of up to two data sets with corresponding selection of rows to report.

                                                                                                                                                                                  target: int or str, default=0 Target column to look at. Only for multilabel tasks. Only bool and numerical features can be used as target.

                                                                                                                                                                                  filename: str, Path or None, default=None Filename or pathlib.Path of the (html) file to save. If None, don't save anything.

                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.

                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task.

                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.

                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                  Returnspd.DataFrame Scores of the models.

                                                                                                                                                                                  method export_pipeline(model=None)[source]Export the internal pipeline.

                                                                                                                                                                                  This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

                                                                                                                                                                                  Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported.

                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                  method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.

                                                                                                                                                                                  Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.

                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

                                                                                                                                                                                  Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks.

                                                                                                                                                                                  method get_sample_weight(rows=\"train\")[source]Return sample weights for a balanced data set.

                                                                                                                                                                                  The returned weights are inversely proportional to the class frequencies in the selected data set. For multioutput tasks, the weights of each column of y will be multiplied.

                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

                                                                                                                                                                                  Returnsseries Sequence of weights with shape=(n_samples,).

                                                                                                                                                                                  method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be used to transform only the target column.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                  Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                  function atom.atom.load(filename, data=None)[source]Load an atom instance from a pickle file.

                                                                                                                                                                                  If the instance was saved using save_data=False, it's possible to load new data into it and apply all data transformations.

                                                                                                                                                                                  Info

                                                                                                                                                                                  The loaded instance's current branch is the same branch as it was when saved.

                                                                                                                                                                                  Parametersfilename: str or Path Filename or pathlib.Path of the pickle file.

                                                                                                                                                                                  data: tuple of indexables or None, default=None Original dataset as it was provided to the instance's constructor. Only use this parameter if the loaded file was saved using save_data=False. Allowed formats are:

                                                                                                                                                                                  • X
                                                                                                                                                                                  • X, y
                                                                                                                                                                                  • train, test
                                                                                                                                                                                  • train, test, holdout
                                                                                                                                                                                  • X_train, X_test, y_train, y_test
                                                                                                                                                                                  • X_train, X_test, X_holdout, y_train, y_test, y_holdout
                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test)
                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)

                                                                                                                                                                                  X, train, test: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str, dict, sequence or dataframe Target column corresponding to `X`.

                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                  Returnsatom Unpickled atom instance.

                                                                                                                                                                                  method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.

                                                                                                                                                                                  Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.

                                                                                                                                                                                  Parametersother: Runner Instance with which to merge. Should be of the same class as self.

                                                                                                                                                                                  suffix: str, default=\"2\" Branches and models with conflicting names are merged adding suffix to the end of their names.

                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                  method reset(hard=False)[source]Reset the instance to its initial state.

                                                                                                                                                                                  Deletes all branches and models. The dataset is also reset to its form after initialization.

                                                                                                                                                                                  Parametershard: bool, default=False If True, completely flushes the cache.

                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                  method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                  save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance.

                                                                                                                                                                                  method save_data(filename=\"auto\", rows=\"dataset\", **kwargs)[source]Save the data in the current branch to a .csv file.

                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"dataset\" Selection of rows to save.

                                                                                                                                                                                  **kwargs Additional keyword arguments for pandas' to_csv method.

                                                                                                                                                                                  method shrink(int2bool=False, int2uint=False, str2cat=False, dense2sparse=False, columns=None)[source]Convert the columns to the smallest possible matching dtype.

                                                                                                                                                                                  Examples are: float64 -> float32, int64 -> int8, etc... Sparse arrays also transform their non-fill value. Use this method for memory optimization before saving the dataset. Note that applying transformers to the data may alter the types again.

                                                                                                                                                                                  Parametersint2bool: bool, default=False Whether to convert int columns to bool type. Only if the values in the column are strictly in (0, 1) or (-1, 1).

                                                                                                                                                                                  int2uint: bool, default=False Whether to convert int to uint (unsigned integer). Only if the values in the column are strictly positive.

                                                                                                                                                                                  str2cat: bool, default=False Whether to convert string to category. Only if the number of categories is less than 30% of the column's length.

                                                                                                                                                                                  dense2sparse: bool, default=False Whether to convert all features to sparse format. The value that is compressed is the most frequent value in the column.

                                                                                                                                                                                  columns: int, str, segment, sequence, dataframe or None, default=None Selection of columns to shrink. If None, transform all columns.

                                                                                                                                                                                  method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                  name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: Stack.

                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the final_estimator parameter.

                                                                                                                                                                                  method stats()[source]Display basic information about the dataset.

                                                                                                                                                                                  method status()[source]Get an overview of the branches and models.

                                                                                                                                                                                  This method prints the same information as the __repr__ and also saves it to the logger.

                                                                                                                                                                                  method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                  method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                  name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: Vote.

                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's voting instance.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomregressor/#data-cleaning", "title": "Data cleaning", "text": "

                                                                                                                                                                                  The data cleaning methods can help you scale the data, handle missing values, categorical columns and outliers. All attributes of the data cleaning classes are attached to atom after running. Read more in the user guide.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the eda method to examine the data and help you determine suitable parameters for the data cleaning methods.

                                                                                                                                                                                  cleanApply standard data cleaning steps on the dataset.discretizeBin continuous data into intervals.encodePerform encoding of categorical features.imputeHandle missing values in the dataset.normalizeTransform the data to follow a Normal/Gaussian distribution.prunePrune outliers from the training set.scaleScale the data.

                                                                                                                                                                                  method clean(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, **kwargs)[source]Apply standard data cleaning steps on the dataset.

                                                                                                                                                                                  Use the parameters to choose which transformations to perform. The available steps are:

                                                                                                                                                                                  • Convert dtypes to the best possible types.
                                                                                                                                                                                  • Drop columns with specific data types.
                                                                                                                                                                                  • Remove characters from column names.
                                                                                                                                                                                  • Strip categorical features from spaces.
                                                                                                                                                                                  • Drop duplicate rows.
                                                                                                                                                                                  • Drop rows with missing values in the target column.
                                                                                                                                                                                  • Encode the target column (ignored for regression tasks).

                                                                                                                                                                                  See the Cleaner class for a description of the parameters.

                                                                                                                                                                                  method discretize(strategy=\"quantile\", bins=5, labels=None, **kwargs)[source]Bin continuous data into intervals.

                                                                                                                                                                                  For each feature, the bin edges are computed during fit and, together with the number of bins, they will define the intervals. Ignores numerical columns.

                                                                                                                                                                                  See the Discretizer class for a description of the parameters.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the plot_distribution method to visualize a column's distribution and decide on the bins.

                                                                                                                                                                                  method encode(strategy=\"Target\", max_onehot=10, ordinal=None, infrequent_to_value=None, value=\"rare\", **kwargs)[source]Perform encoding of categorical features.

                                                                                                                                                                                  The encoding type depends on the number of classes in the column:

                                                                                                                                                                                  • If n_classes=2 or ordinal feature, use Ordinal-encoding.
                                                                                                                                                                                  • If 2 < n_classes <= max_onehot, use OneHot-encoding.
                                                                                                                                                                                  • If n_classes > max_onehot, use strategy-encoding.

                                                                                                                                                                                  Missing values are propagated to the output column. Unknown classes encountered during transforming are imputed according to the selected strategy. Rare classes can be replaced with a value in order to prevent too high cardinality.

                                                                                                                                                                                  See the Encoder class for a description of the parameters.

                                                                                                                                                                                  Note

                                                                                                                                                                                  This method only encodes the categorical features. It does not encode the target column! Use the clean method for that.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the categorical attribute for a list of the categorical features in the dataset.

                                                                                                                                                                                  method impute(strat_num=\"drop\", strat_cat=\"drop\", max_nan_rows=None, max_nan_cols=None, **kwargs)[source]Handle missing values in the dataset.

                                                                                                                                                                                  Impute or remove missing values according to the selected strategy. Also removes rows and columns with too many missing values. Use the missing attribute to customize what are considered \"missing values\".

                                                                                                                                                                                  See the Imputer class for a description of the parameters.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the nans attribute to check the amount of missing values per column.

                                                                                                                                                                                  method normalize(strategy=\"yeojohnson\", **kwargs)[source]Transform the data to follow a Normal/Gaussian distribution.

                                                                                                                                                                                  This transformation is useful for modeling issues related to heteroscedasticity (non-constant variance), or other situations where normality is desired. Missing values are disregarded in fit and maintained in transform. Ignores categorical columns.

                                                                                                                                                                                  See the Normalizer class for a description of the parameters.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the plot_distribution method to examine a column's distribution.

                                                                                                                                                                                  method prune(strategy=\"zscore\", method=\"drop\", max_sigma=3, include_target=False, **kwargs)[source]Prune outliers from the training set.

                                                                                                                                                                                  Replace or remove outliers. The definition of outlier depends on the selected strategy and can greatly differ from one another. Ignores categorical columns.

                                                                                                                                                                                  See the Pruner class for a description of the parameters.

                                                                                                                                                                                  Note

                                                                                                                                                                                  This transformation is only applied to the training set in order to maintain the original distribution of samples in the test set.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the outliers attribute to check the number of outliers per column.

                                                                                                                                                                                  method scale(strategy=\"standard\", include_binary=False, **kwargs)[source]Scale the data.

                                                                                                                                                                                  Apply one of sklearn's scalers. Categorical columns are ignored.

                                                                                                                                                                                  See the Scaler class for a description of the parameters.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the scaled attribute to check whether the dataset is scaled.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomregressor/#nlp", "title": "NLP", "text": "

                                                                                                                                                                                  The Natural Language Processing (NLP) transformers help to convert raw text to meaningful numeric values, ready to be ingested by a model. All transformations are applied only on the column in the dataset called corpus. Read more in the user guide.

                                                                                                                                                                                  textcleanApply standard text cleaning to the corpus.textnormalizeNormalize the corpus.tokenizeTokenize the corpus.vectorizeVectorize the corpus.

                                                                                                                                                                                  method textclean(decode=True, lower_case=True, drop_email=True, regex_email=None, drop_url=True, regex_url=None, drop_html=True, regex_html=None, drop_emoji=True, regex_emoji=None, drop_number=True, regex_number=None, drop_punctuation=True, **kwargs)[source]Apply standard text cleaning to the corpus.

                                                                                                                                                                                  Transformations include normalizing characters and dropping noise from the text (emails, HTML tags, URLs, etc...). The transformations are applied on the column named corpus, in the same order the parameters are presented. If there is no column with that name, an exception is raised.

                                                                                                                                                                                  See the TextCleaner class for a description of the parameters.

                                                                                                                                                                                  method textnormalize(stopwords=True, custom_stopwords=None, stem=False, lemmatize=True, **kwargs)[source]Normalize the corpus.

                                                                                                                                                                                  Convert words to a more uniform standard. The transformations are applied on the column named corpus, in the same order the parameters are presented. If there is no column with that name, an exception is raised. If the provided documents are strings, words are separated by spaces.

                                                                                                                                                                                  See the TextNormalizer class for a description of the parameters.

                                                                                                                                                                                  method tokenize(bigram_freq=None, trigram_freq=None, quadgram_freq=None, **kwargs)[source]Tokenize the corpus.

                                                                                                                                                                                  Convert documents into sequences of words. Additionally, create n-grams (represented by words united with underscores, e.g., \"New_York\") based on their frequency in the corpus. The transformations are applied on the column named corpus. If there is no column with that name, an exception is raised.

                                                                                                                                                                                  See the Tokenizer class for a description of the parameters.

                                                                                                                                                                                  method vectorize(strategy=\"bow\", return_sparse=True, **kwargs)[source]Vectorize the corpus.

                                                                                                                                                                                  Transform the corpus into meaningful vectors of numbers. The transformation is applied on the column named corpus. If there is no column with that name, an exception is raised.

                                                                                                                                                                                  If strategy=\"bow\" or \"tfidf\", the transformed columns are named after the word they are embedding with the prefix corpus_. If strategy=\"hashing\", the columns are named hash[N], where N stands for the n-th hashed column.

                                                                                                                                                                                  See the Vectorizer class for a description of the parameters.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomregressor/#feature-engineering", "title": "Feature engineering", "text": "

                                                                                                                                                                                  To further pre-process the data, it's possible to extract features from datetime columns, create new non-linear features transforming the existing ones, group similar features or, if the dataset is too large, remove features. Read more in the user guide.

                                                                                                                                                                                  feature_extractionExtract features from datetime columns.feature_generationGenerate new features.feature_groupingExtract statistics from similar features.feature_selectionReduce the number of features in the data.

                                                                                                                                                                                  method feature_extraction(features=('day', 'month', 'year'), fmt=None, encoding_type=\"ordinal\", drop_columns=True, **kwargs)[source]Extract features from datetime columns.

                                                                                                                                                                                  Create new features extracting datetime elements (day, month, year, etc...) from the provided columns. Columns of dtype datetime64 are used as is. Categorical columns that can be successfully converted to a datetime format (less than 30% NaT values after conversion) are also used.

                                                                                                                                                                                  See the FeatureExtractor class for a description of the parameters.

                                                                                                                                                                                  method feature_generation(strategy=\"dfs\", n_features=None, operators=None, **kwargs)[source]Generate new features.

                                                                                                                                                                                  Create new combinations of existing features to capture the non-linear relations between the original features.

                                                                                                                                                                                  See the FeatureGenerator class for a description of the parameters.

                                                                                                                                                                                  method feature_grouping(groups, operators=None, drop_columns=True, **kwargs)[source]Extract statistics from similar features.

                                                                                                                                                                                  Replace groups of features with related characteristics with new features that summarize statistical properties of the group. The statistical operators are calculated over every row of the group. The group names and features can be accessed through the groups method.

                                                                                                                                                                                  See the FeatureGrouper class for a description of the parameters.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use a regex pattern with the groups parameter to select groups easier, e.g., atom.feature_grouping({\"group1\": \"var_.+\") to select all features that start with var_.

                                                                                                                                                                                  method feature_selection(strategy=None, solver=None, n_features=None, min_repeated=2, max_repeated=1.0, max_correlation=1.0, **kwargs)[source]Reduce the number of features in the data.

                                                                                                                                                                                  Apply feature selection or dimensionality reduction, either to improve the estimators' accuracy or to boost their performance on very high-dimensional datasets. Additionally, remove multicollinear and low-variance features.

                                                                                                                                                                                  See the FeatureSelector class for a description of the parameters.

                                                                                                                                                                                  Note

                                                                                                                                                                                  • When strategy=\"univariate\" and solver=None, f_classif or f_regression is used as default solver.
                                                                                                                                                                                  • When strategy is \"sfs\", \"rfecv\" or any of the advanced strategies and no scoring is specified, atom's metric (if it exists) is used as scoring.

                                                                                                                                                                                  "}, {"location": "API/ATOM/atomregressor/#training", "title": "Training", "text": "

                                                                                                                                                                                  The training methods are where the models are fitted to the data and their performance is evaluated against a selected metric. There are three methods to call the three different training approaches. Read more in the user guide.

                                                                                                                                                                                  runTrain and evaluate the models in a direct fashion.successive_halvingFit the models in a successive halving fashion.train_sizingTrain and evaluate the models in a train sizing fashion.

                                                                                                                                                                                  method run(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a direct fashion.

                                                                                                                                                                                  Contrary to successive_halving and train_sizing, the direct approach only iterates once over the models, using the full dataset.

                                                                                                                                                                                  The following steps are applied to every model:

                                                                                                                                                                                  1. Apply hyperparameter tuning (optional).
                                                                                                                                                                                  2. Fit the model on the training set using the best combination of hyperparameters found.
                                                                                                                                                                                  3. Evaluate the model on the test set.
                                                                                                                                                                                  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

                                                                                                                                                                                  See the DirectClassifier or DirectRegressor class for a description of the parameters.

                                                                                                                                                                                  method successive_halving(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Fit the models in a successive halving fashion.

                                                                                                                                                                                  The successive halving technique is a bandit-based algorithm that fits N models to 1/N of the data. The best half are selected to go to the next iteration where the process is repeated. This continues until only one model remains, which is fitted on the complete dataset. Beware that a model's performance can depend greatly on the amount of data on which it is trained. For this reason, it is recommended to only use this technique with similar models, e.g., only using tree-based models.

                                                                                                                                                                                  The following steps are applied to every model (per iteration):

                                                                                                                                                                                  1. Apply hyperparameter tuning (optional).
                                                                                                                                                                                  2. Fit the model on the training set using the best combination of hyperparameters found.
                                                                                                                                                                                  3. Evaluate the model on the test set.
                                                                                                                                                                                  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

                                                                                                                                                                                  See the SuccessiveHalvingClassifier or SuccessiveHalvingRegressor class for a description of the parameters.

                                                                                                                                                                                  method train_sizing(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a train sizing fashion.

                                                                                                                                                                                  When training models, there is usually a trade-off between model performance and computation time, which is regulated by the number of samples in the training set. This method can be used to gain insight into this trade-off and help determine the optimal size of the training set. The models are fitted multiple times, each time increasing the number of samples in the training set.

                                                                                                                                                                                  The following steps are applied to every model (per iteration):

                                                                                                                                                                                  1. Apply hyperparameter tuning (optional).
                                                                                                                                                                                  2. Fit the model on the training set using the best combination of hyperparameters found.
                                                                                                                                                                                  3. Evaluate the model on the test set.
                                                                                                                                                                                  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

                                                                                                                                                                                  See the TrainSizingClassifier or TrainSizingRegressor class for a description of the parameters.

                                                                                                                                                                                  "}, {"location": "API/branch/branch/", "title": "Branch", "text": "

                                                                                                                                                                                  class atom.branch.branch.Branch(name, memory=None, data=None, holdout=None)[source]Object that contains the data.

                                                                                                                                                                                  A branch contains a specific pipeline, the dataset transformed through that pipeline, the models fitted on that dataset, and all data and utility attributes that refer to that dataset. Branches can be created and accessed through atom's branch attribute.

                                                                                                                                                                                  All public properties and attributes of the branch can be accessed from the parent.

                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  This class should not be called directly. Branches are created internally by the ATOMClassifier, ATOMForecaster and ATOMRegressor classes.

                                                                                                                                                                                  Parametersname: str Name of the branch.

                                                                                                                                                                                  memory: str, Memory or None, default=None Memory object for pipeline caching and to store the data when the branch is inactive.

                                                                                                                                                                                  data: DataContainer or None, default=None Data for the branch.

                                                                                                                                                                                  holdout: dataframe or None, default=None Holdout data set.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  BranchManager Object that manages branches.

                                                                                                                                                                                  "}, {"location": "API/branch/branch/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> # Initialize atom\n>>> atom = ATOMClassifier(X, y, verbose=2)\n\n<< ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 138.97 kB\nScaled: False\nOutlier values: 177 (1.3%)\n\n\n\n>>> # Train a model\n>>> atom.run(\"RF\")\n\n\nTraining ========================= >>\nModels: RF\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9517\nTime elapsed: 0.236s\n-------------------------------------------------\nTime: 0.236s\n\n\nFinal results ==================== >>\nTotal time: 0.239s\n-------------------------------------\nRandomForest --> f1: 0.9517\n\n\n>>> # Change the branch and apply feature scaling\n>>> atom.branch = \"scaled\"\n\nSuccessfully created new branch: scaled.\n\n\n>>> atom.scale()\n\nFitting Scaler...\nScaling features...\n\n>>> atom.run(\"RF_scaled\")\n\n\nTraining ========================= >>\nModels: RF_scaled\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9517\nTime elapsed: 0.237s\n-------------------------------------------------\nTime: 0.237s\n\n\nFinal results ==================== >>\nTotal time: 0.240s\n-------------------------------------\nRandomForest --> f1: 0.9517\n\n\n>>> # Compare the models\n>>> atom.plot_roc()\n
                                                                                                                                                                                  "}, {"location": "API/branch/branch/#attributes", "title": "Attributes", "text": "

                                                                                                                                                                                  Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                  mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                  The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                  "}, {"location": "API/branch/branch/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  load: Load the branch's data from memory. store: Store the branch's data as a pickle in memory.

                                                                                                                                                                                  method load(assign=True)[source]Load the branch's data from memory.

                                                                                                                                                                                  This method is used to restore the data of inactive branches.

                                                                                                                                                                                  Parametersassign: bool, default=True Whether to assign the loaded data to self.

                                                                                                                                                                                  ReturnsDataContainer or None Own data information. Returns None if no data is set.

                                                                                                                                                                                  method store(assign=True)[source]Store the branch's data as a pickle in memory.

                                                                                                                                                                                  After storage, the data is deleted, and the branch is no longer usable until load is called. This method is used to store the data for inactive branches.

                                                                                                                                                                                  Note

                                                                                                                                                                                  This method is skipped silently for branches with no memory allocation.

                                                                                                                                                                                  Parametersassign: bool, default=True Whether to assign None to the data in self.

                                                                                                                                                                                  "}, {"location": "API/branch/branchmanager/", "title": "BranchManager", "text": "

                                                                                                                                                                                  class atom.branch.branchmanager.BranchManager(memory=None)[source]Object that manages branches.

                                                                                                                                                                                  Maintains references to a series of branches and the current active branch. Additionally, always stores an 'original' branch containing the original dataset (prior to any transformations). The branches share a reference to a holdout set, not the instance self. When a memory object is specified, it stores inactive branches in memory.

                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  This class should not be called directly. The BranchManager is created internally by the ATOMClassifier, ATOMForecaster and ATOMRegressor classes.

                                                                                                                                                                                  Parametersmemory: str, Memory or None, default=None Location to store inactive branches. If None, all branches are kept in memory. This memory object is passed to the branches for pipeline caching.

                                                                                                                                                                                  Attributesbranches: ClassMap Collection of branches.

                                                                                                                                                                                  og: Branch Branch containing the original dataset. It can be any branch in branches or an internally made branch called og.

                                                                                                                                                                                  current: Branch Current active branch.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  Branch Object that contains the data.

                                                                                                                                                                                  "}, {"location": "API/branch/branchmanager/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> # Initialize atom\n>>> atom = ATOMClassifier(X, y, verbose=2)\n\n<< ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 138.97 kB\nScaled: False\nOutlier values: 174 (1.2%)\n\n\n\n>>> # Train a model\n>>> atom.run(\"RF\")\n\n\nTraining ========================= >>\nModels: RF\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9655\nTime elapsed: 0.229s\n-------------------------------------------------\nTime: 0.229s\n\n\nFinal results ==================== >>\nTotal time: 0.232s\n-------------------------------------\nRandomForest --> f1: 0.9655\n\n\n>>> # Change the branch and apply feature scaling\n>>> atom.branch = \"scaled\"\n\nSuccessfully created new branch: scaled.\n\n\n>>> atom.scale()\n\nFitting Scaler...\nScaling features...\n\n>>> atom.run(\"RF_scaled\")\n\n\nTraining ========================= >>\nModels: RF_scaled\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9722\nTime elapsed: 0.228s\n-------------------------------------------------\nTime: 0.228s\n\n\nFinal results ==================== >>\nTotal time: 0.231s\n-------------------------------------\nRandomForest --> f1: 0.9722\n\n\n>>> # Compare the models\n>>> atom.plot_roc()\n
                                                                                                                                                                                  "}, {"location": "API/branch/branchmanager/#attributes", "title": "Attributes", "text": "

                                                                                                                                                                                  Attributes branches: ClassMap Collection of branches.

                                                                                                                                                                                  og: Branch Branch containing the original dataset. It can be any branch in branches or an internally made branch called og.

                                                                                                                                                                                  current: Branch Current active branch.

                                                                                                                                                                                  "}, {"location": "API/branch/branchmanager/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  addAdd a new branch to the manager.fillFill the current branch with data.resetReset this instance to its initial state.

                                                                                                                                                                                  method add(name, parent=None)[source] Add a new branch to the manager.

                                                                                                                                                                                  If the branch is called og (reserved name for the original branch), it's created separately and stored in memory.

                                                                                                                                                                                  Parameters name: str Name for the new branch.

                                                                                                                                                                                  parent: Branch or None, default=None Parent branch. Data and attributes from the parent are passed to the new branch.

                                                                                                                                                                                  method fill(data, holdout=None)[source]Fill the current branch with data.

                                                                                                                                                                                  Parametersdata: DataContainer New data for the current branch.

                                                                                                                                                                                  holdout: dataframe or None, default=None Holdout data set (if any).

                                                                                                                                                                                  method reset(hard=False)[source]Reset this instance to its initial state.

                                                                                                                                                                                  The initial state of the BranchManager contains a single branch called main with no data. There's no reference to an original (og) branch.

                                                                                                                                                                                  Parameters hard: bool, default=False If True, completely flushes the cache.

                                                                                                                                                                                  "}, {"location": "API/data_cleaning/balancer/", "title": "Balancer", "text": "

                                                                                                                                                                                  class atom.data_cleaning.Balancer(strategy=\"ADASYN\", n_jobs=1, verbose=0, logger=None, random_state=None, **kwargs)[source]Balance the number of samples per class in the target column.

                                                                                                                                                                                  When oversampling, the newly created samples have an increasing integer index for numerical indices, and an index of the form [estimator]_N for non-numerical indices, where N stands for the N-th sample in the data set. Use only for classification tasks.

                                                                                                                                                                                  This class can be accessed from atom through the balance method. Read more in the user guide.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  • The clustercentroids estimator is unavailable because of incompatibilities of the APIs.
                                                                                                                                                                                  • The Balancer class does not support multioutput tasks.

                                                                                                                                                                                  Parametersstrategy: str or estimator, default=\"ADASYN\" Type of algorithm with which to balance the dataset. Choose from the name of any estimator in the imbalanced-learn package or provide a custom instance of such.

                                                                                                                                                                                  n_jobs: int, default=1 Number of cores to use for parallel processing.

                                                                                                                                                                                  • If >0: Number of cores to use.
                                                                                                                                                                                  • If -1: Use all available cores.
                                                                                                                                                                                  • If <-1: Use number of cores + 1 + value (e.g., -2 uses all cores but one).

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic naming.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

                                                                                                                                                                                  **kwargs Additional keyword arguments for the strategy estimator.

                                                                                                                                                                                  Attributes[strategy]_: imblearn estimator Object (lowercase strategy) used to balance the data, e.g., balancer.adasyn_ for the default strategy.

                                                                                                                                                                                  mapping_: dict Target values mapped to their respective encoded integers.

                                                                                                                                                                                  feature_names_in_: np.ndarray Names of features seen during fit.

                                                                                                                                                                                  target_names_in_: np.ndarray Names of the target column seen during fit.

                                                                                                                                                                                  n_features_in_: int Number of features seen during fit.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  Encoder Perform encoding of categorical features.

                                                                                                                                                                                  Imputer Handle missing values in the data.

                                                                                                                                                                                  Pruner Prune outliers from the data.

                                                                                                                                                                                  "}, {"location": "API/data_cleaning/balancer/#example", "title": "Example", "text": "atomstand-alone
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> print(atom.train)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630             0.054390         0.1720  ...           107.30       740.4            0.1610            0.42250          0.50300               0.22580          0.2807                  0.10710       0\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690             0.094510         0.1860  ...           142.20      1493.0            0.1492            0.25360          0.37590               0.15100          0.3074                  0.07863       0\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699             0.047440         0.1538  ...           135.10      1320.0            0.1315            0.18060          0.20800               0.11360          0.2504                  0.07948       0\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686             0.027390         0.1852  ...           
110.10       931.4            0.1148            0.09866          0.15470               0.06575          0.3233                  0.06165       0\n4           8.95         15.76           58.74      245.2          0.09462           0.12430         0.09263             0.023080         0.1305  ...            63.34       270.0            0.1179            0.18790          0.15440               0.03846          0.1652                  0.07722       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     ...\n451        19.73         19.82          130.70     1206.0          0.10620           0.18490         0.24170             0.097400         0.1733  ...           159.80      1933.0            0.1710            0.59550          0.84890               0.25070          0.2749                  0.12970       0\n452        12.72         13.78           81.78      492.1          0.09667           0.08393         0.01288             0.019240         0.1638  ...            88.54       553.7            0.1298            0.14720          0.05233               0.06343          0.2369                  0.06922       1\n453        11.51         23.93           74.52      403.5          0.09261           0.10210         0.11120             0.041050         0.1388  ...            82.28       474.2            0.1298            0.25170          0.36300               0.09653          0.2112                  0.08732       1\n454        10.75         14.97           68.26      355.3          0.07793           0.05139         0.02251             0.007875         0.1399  ...            
77.79       441.2            0.1076            0.12230          0.09755               0.03413          0.2300                  0.06769       1\n455        25.22         24.91          171.50     1878.0          0.10630           0.26650         0.33390             0.184500         0.1829  ...           211.70      2562.0            0.1573            0.60760          0.64760               0.28670          0.2355                  0.10510       0\n\n[456 rows x 31 columns]\n\n\n>>> atom.balance(strategy=\"smote\", verbose=2)\n\nOversampling with SMOTE...\n --> Adding 116 samples to class 0.\n\n\n>>> # Note that the number of rows has increased\n>>> print(atom.train)\n\n     mean radius  mean texture  mean perimeter    mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter   worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0      13.480000     20.820000       88.400000   559.200000         0.101600          0.125500        0.106300             0.054390       0.172000  ...       107.300000   740.400000          0.161000           0.422500         0.503000              0.225800        0.280700                 0.107100       0\n1      18.310000     20.580000      120.800000  1052.000000         0.106800          0.124800        0.156900             0.094510       0.186000  ...       142.200000  1493.000000          0.149200           0.253600         0.375900              0.151000        0.307400                 0.078630       0\n2      17.930000     24.480000      115.200000   998.900000         0.088550          0.070270        0.056990             0.047440       0.153800  ...       
135.100000  1320.000000          0.131500           0.180600         0.208000              0.113600        0.250400                 0.079480       0\n3      15.130000     29.810000       96.710000   719.500000         0.083200          0.046050        0.046860             0.027390       0.185200  ...       110.100000   931.400000          0.114800           0.098660         0.154700              0.065750        0.323300                 0.061650       0\n4       8.950000     15.760000       58.740000   245.200000         0.094620          0.124300        0.092630             0.023080       0.130500  ...        63.340000   270.000000          0.117900           0.187900         0.154400              0.038460        0.165200                 0.077220       1\n..           ...           ...             ...          ...              ...               ...             ...                  ...            ...  ...              ...          ...               ...                ...              ...                   ...             ...                      ...     ...\n567    15.182945     22.486774       98.949465   711.386079         0.092513          0.102732        0.113923             0.069481       0.179224  ...       107.689157   826.276172          0.126730           0.199259         0.295172              0.142325        0.265352                 0.068318       0\n568    19.990378     20.622944      130.491182  1253.735467         0.091583          0.117753        0.117236             0.082771       0.202428  ...       167.456689  1995.896044          0.132457           0.289652         0.332006              0.182989        0.299088                 0.084150       0\n569    18.158121     18.928220      119.907435  1027.331092         0.113149          0.147089        0.171862             0.103942       0.209306  ...       
135.286302  1319.270051          0.127029           0.233493         0.260138              0.133851        0.302406                 0.079535       0\n570    23.733233     26.433751      158.185672  1724.145541         0.098008          0.193789        0.231158             0.139527       0.188817  ...       207.483796  2844.559632          0.150495           0.463361         0.599077              0.266433        0.290828                 0.091542       0\n571    17.669575     16.375717      115.468589   968.552411         0.093636          0.109983        0.101005             0.075283       0.174505  ...       133.767576  1227.195245          0.118221           0.264624         0.249798              0.135098        0.268044                 0.076533       0\n\n[572 rows x 31 columns]\n
                                                                                                                                                                                  >>> from atom.data_cleaning import Balancer\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n>>> print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst texture  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension\n0          17.99         10.38          122.80     1001.0          0.11840           0.27760         0.30010              0.14710         0.2419  ...          17.33           184.60      2019.0           0.16220            0.66560           0.7119                0.2654          0.4601                  0.11890\n1          20.57         17.77          132.90     1326.0          0.08474           0.07864         0.08690              0.07017         0.1812  ...          23.41           158.80      1956.0           0.12380            0.18660           0.2416                0.1860          0.2750                  0.08902\n2          19.69         21.25          130.00     1203.0          0.10960           0.15990         0.19740              0.12790         0.2069  ...          25.53           152.50      1709.0           0.14440            0.42450           0.4504                0.2430          0.3613                  0.08758\n3          11.42         20.38           77.58      386.1          0.14250           0.28390         0.24140              0.10520         0.2597  ...          
26.50            98.87       567.7           0.20980            0.86630           0.6869                0.2575          0.6638                  0.17300\n4          20.29         14.34          135.10     1297.0          0.10030           0.13280         0.19800              0.10430         0.1809  ...          16.67           152.20      1575.0           0.13740            0.20500           0.4000                0.1625          0.2364                  0.07678\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...            ...              ...         ...               ...                ...              ...                   ...             ...                      ...\n564        21.56         22.39          142.00     1479.0          0.11100           0.11590         0.24390              0.13890         0.1726  ...          26.40           166.10      2027.0           0.14100            0.21130           0.4107                0.2216          0.2060                  0.07115\n565        20.13         28.25          131.20     1261.0          0.09780           0.10340         0.14400              0.09791         0.1752  ...          38.25           155.00      1731.0           0.11660            0.19220           0.3215                0.1628          0.2572                  0.06637\n566        16.60         28.08          108.30      858.1          0.08455           0.10230         0.09251              0.05302         0.1590  ...          34.12           126.70      1124.0           0.11390            0.30940           0.3403                0.1418          0.2218                  0.07820\n567        20.60         29.33          140.10     1265.0          0.11780           0.27700         0.35140              0.15200         0.2397  ...          
39.42           184.60      1821.0           0.16500            0.86810           0.9387                0.2650          0.4087                  0.12400\n568         7.76         24.54           47.92      181.0          0.05263           0.04362         0.00000              0.00000         0.1587  ...          30.37            59.16       268.6           0.08996            0.06444           0.0000                0.0000          0.2871                  0.07039\n\n[569 rows x 30 columns]\n\n\n>>> balancer = Balancer(strategy=\"smote\", verbose=2)\n>>> X, y = balancer.fit_transform(X, y)\n\nOversampling with SMOTE...\n --> Adding 145 samples to class 0.\n\n\n>>> # Note that the number of rows has increased\n>>> print(X)\n\n     mean radius  mean texture  mean perimeter    mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst texture  worst perimeter   worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension\n0      17.990000     10.380000      122.800000  1001.000000         0.118400          0.277600        0.300100             0.147100       0.241900  ...      17.330000       184.600000  2019.000000          0.162200           0.665600         0.711900              0.265400        0.460100                 0.118900\n1      20.570000     17.770000      132.900000  1326.000000         0.084740          0.078640        0.086900             0.070170       0.181200  ...      23.410000       158.800000  1956.000000          0.123800           0.186600         0.241600              0.186000        0.275000                 0.089020\n2      19.690000     21.250000      130.000000  1203.000000         0.109600          0.159900        0.197400             0.127900       0.206900  ...      
25.530000       152.500000  1709.000000          0.144400           0.424500         0.450400              0.243000        0.361300                 0.087580\n3      11.420000     20.380000       77.580000   386.100000         0.142500          0.283900        0.241400             0.105200       0.259700  ...      26.500000        98.870000   567.700000          0.209800           0.866300         0.686900              0.257500        0.663800                 0.173000\n4      20.290000     14.340000      135.100000  1297.000000         0.100300          0.132800        0.198000             0.104300       0.180900  ...      16.670000       152.200000  1575.000000          0.137400           0.205000         0.400000              0.162500        0.236400                 0.076780\n..           ...           ...             ...          ...              ...               ...             ...                  ...            ...  ...            ...              ...          ...               ...                ...              ...                   ...             ...                      ...\n709    19.478557     23.348123      128.995257  1164.950583         0.101810          0.143231        0.194792             0.095794       0.198376  ...      30.482866       143.381227  1362.533650          0.135197           0.267786         0.365230              0.170069        0.273984                 0.076077\n710    18.752895     20.824323      124.472875  1084.317645         0.096491          0.171270        0.177021             0.095356       0.204866  ...      27.544127       160.451305  1623.116663          0.133721           0.506298         0.521417              0.203921        0.348906                 0.098688\n711    17.182368     21.204540      112.271609   925.918840         0.100517          0.110961        0.110803             0.076692       0.204604  ...      
28.119577       142.316398  1439.815962          0.155602           0.277795         0.388351              0.207039        0.334574                 0.080310\n712    18.285452     20.578363      120.603613  1048.317740         0.106252          0.125135        0.153635             0.093128       0.188095  ...      26.188544       142.298194  1487.517523          0.147703           0.251890         0.365958              0.150828        0.308848                 0.078435\n713    14.550791     25.918705       96.913441   655.023273         0.111607          0.166865        0.158127             0.077468       0.228924  ...      36.072516       123.641397   930.709825          0.163673           0.659480         0.662486              0.197880        0.423041                 0.132320\n\n[714 rows x 30 columns]\n
                                                                                                                                                                                  "}, {"location": "API/data_cleaning/balancer/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformBalance the data.

                                                                                                                                                                                  method fit(X, y=-1)[source]Fit to data.

                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str, dict or sequence, default=-1 Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                  Returns Self Estimator instance.

                                                                                                                                                                                  method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                  method inverse_transform(X=None, y=None)[source]Do nothing.

                                                                                                                                                                                  Returns the input unchanged. Implemented for continuity of the API.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  Returnsdataframe Feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Target column. Only returned if provided.

                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                  method transform(X, y=-1)[source]Balance the data.

                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str or sequence, default=-1 Target column corresponding to `X`.

                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • Else: Array with shape=(n_samples,) to use as target.

                                                                                                                                                                                  Returnsdataframe Balanced dataframe.

                                                                                                                                                                                  series Transformed target column.

                                                                                                                                                                                  "}, {"location": "API/data_cleaning/cleaner/", "title": "Cleaner", "text": "

                                                                                                                                                                                  class atom.data_cleaning.Cleaner(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None)[source]Applies standard data cleaning steps on a dataset.

                                                                                                                                                                                  Use the parameters to choose which transformations to perform. The available steps are:

                                                                                                                                                                                  • Convert dtypes to the best possible types.
                                                                                                                                                                                  • Drop columns with specific data types.
                                                                                                                                                                                  • Remove characters from column names.
                                                                                                                                                                                  • Strip spaces from categorical features.
                                                                                                                                                                                  • Drop duplicate rows.
                                                                                                                                                                                  • Drop rows with missing values in the target column.
                                                                                                                                                                                  • Encode the target column.

                                                                                                                                                                                  This class can be accessed from atom through the clean method. Read more in the user guide.

                                                                                                                                                                                  Parametersconvert_dtypes: bool, default=True Convert the column's data types to the best possible types that support pd.NA.

                                                                                                                                                                                  drop_dtypes: str, sequence or None, default=None Columns with these data types are dropped from the dataset.

                                                                                                                                                                                  drop_chars: str or None, default=None Remove the specified regex pattern from column names, e.g. [^A-Za-z0-9]+ to remove all non-alphanumerical characters.

                                                                                                                                                                                  strip_categorical: bool, default=True Whether to strip spaces from categorical columns.

                                                                                                                                                                                  drop_duplicates: bool, default=False Whether to drop duplicate rows. Only the first occurrence of every duplicated row is kept.

                                                                                                                                                                                  drop_missing_target: bool, default=True Whether to drop rows with missing values in the target column. This transformation is ignored if y is not provided.

                                                                                                                                                                                  encode_target: bool, default=True Whether to encode the target column(s). This includes converting categorical columns to numerical, and binarizing multilabel columns. This transformation is ignored if y is not provided.

                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic naming.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  Attributesmissing_: list Values that are considered \"missing\". Default values are: None, NaN, NA, NaT, +inf, -inf, \"\", \"?\", \"NA\", \"nan\", \"NaN\", \"NaT\", \"none\", \"None\", \"inf\", \"-inf\". Note that None, NaN, NA, +inf and -inf are always considered missing since they are incompatible with sklearn estimators.

                                                                                                                                                                                  mapping_: dict Target values mapped to their respective encoded integers. Only available if encode_target=True.

                                                                                                                                                                                  feature_names_in_: np.ndarray Names of features seen during fit.

                                                                                                                                                                                  target_names_in_: np.ndarray Names of the target column(s) seen during fit.

                                                                                                                                                                                  n_features_in_: int Number of features seen during fit.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  Encoder Perform encoding of categorical features.

                                                                                                                                                                                  Discretizer Bin continuous data into intervals.

                                                                                                                                                                                  Scaler Scale the data.

                                                                                                                                                                                  "}, {"location": "API/data_cleaning/cleaner/#example", "title": "Example", "text": "atom / stand-alone
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n>>> y = [\"a\" if i else \"b\" for i in y]\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> print(atom.y)\n\n0      a\n1      a\n2      a\n3      a\n4      a\n      ..\n564    a\n565    a\n566    a\n567    a\n568    b\nName: target, Length: 569, dtype: object\n\n\n>>> atom.clean(verbose=2)\n\nFitting Cleaner...\nCleaning the data...\n --> Label-encoding column target.\n\n\n>>> print(atom.y)\n\n0      0\n1      0\n2      0\n3      0\n4      0\n      ..\n564    0\n565    0\n566    0\n567    0\n568    1\nName: target, Length: 569, dtype: Int64\n
                                                                                                                                                                                  >>> from atom.data_cleaning import Cleaner\n>>> from numpy.random import randint\n\n>>> y = [\"a\" if i else \"b\" for i in range(randint(100))]\n\n>>> cleaner = Cleaner(verbose=2)\n>>> y = cleaner.fit_transform(y=y)\n\nFitting Cleaner...\nCleaning the data...\n --> Label-encoding column target.\n\n\n>>> print(y)\n\n0     1\n1     0\n2     0\n3     0\n4     0\n5     0\n6     0\n7     0\n8     0\n9     0\n10    0\n11    0\n12    0\n13    0\n14    0\n15    0\n16    0\n17    0\n18    0\n19    0\n20    0\n21    0\n22    0\n23    0\n24    0\n25    0\n26    0\n27    0\n28    0\n29    0\n30    0\n31    0\n32    0\n33    0\n34    0\n35    0\n36    0\nName: target, dtype: Int64\n
                                                                                                                                                                                  "}, {"location": "API/data_cleaning/cleaner/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  fit: Fit to data. fit_transform: Fit to data, then transform it. get_params: Get parameters for this estimator. inverse_transform: Inversely transform the label encoding. set_params: Set the parameters of this estimator. transform: Apply the data cleaning steps to the data.

                                                                                                                                                                                  method fit(X=None, y=None)[source]Fit to data.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  ReturnsSelf Estimator instance.

                                                                                                                                                                                  method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                  method inverse_transform(X=None, y=None)[source]Inversely transform the label encoding.

                                                                                                                                                                                  This method only inversely transforms the target encoding. The rest of the transformations can't be inverted. If encode_target=False, the data is returned as is.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Does nothing. Implemented for continuity of the API.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  Returnsdataframe Unchanged feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                  method transform(X=None, y=None)[source]Apply the data cleaning steps to the data.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                  series Transformed target column. Only returned if provided.

                                                                                                                                                                                  "}, {"location": "API/data_cleaning/discretizer/", "title": "Discretizer", "text": "

                                                                                                                                                                                  class atom.data_cleaning.Discretizer(strategy=\"quantile\", bins=5, labels=None, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, random_state=None)[source]Bin continuous data into intervals.

                                                                                                                                                                                  For each feature, the bin edges are computed during fit and, together with the number of bins, they define the intervals. Ignores categorical columns.

                                                                                                                                                                                  This class can be accessed from atom through the discretize method. Read more in the user guide.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  The transformation returns categorical columns. Use the Encoder class to convert them back to numerical types.

                                                                                                                                                                                  Parametersstrategy: str, default=\"quantile\" Strategy used to define the widths of the bins. Choose from:

                                                                                                                                                                                  • \"uniform\": All bins have identical widths.
                                                                                                                                                                                  • \"quantile\": All bins have the same number of points.
                                                                                                                                                                                  • \"kmeans\": Values in each bin have the same nearest center of a 1D k-means cluster.
                                                                                                                                                                                  • \"custom\": Use custom bin edges provided through bins.

                                                                                                                                                                                  bins: int, sequence or dict, default=5 Bin number or bin edges in which to split every column.

                                                                                                                                                                                  • If int: Number of bins to produce for all columns. Only for strategy!=\"custom\".
                                                                                                                                                                                  • If sequence:

                                                                                                                                                                                    • For strategy!=\"custom\": Number of bins per column. The n-th value corresponds to the n-th column that is transformed. Categorical columns are ignored.
                                                                                                                                                                                    • For strategy=\"custom\": Bin edges with length=n_bins - 1. The outermost edges are always -inf and +inf, e.g., bins [1, 2] indicate (-inf, 1], (1, 2], (2, inf].
                                                                                                                                                                                  • If dict: One of the aforementioned options per column, where the key is the column's name. Columns that are not in the dictionary are not transformed.

                                                                                                                                                                                  labels: sequence, dict or None, default=None Label names with which to replace the binned intervals.

                                                                                                                                                                                  • If None: Use default labels of the form (min_edge, max_edge].
                                                                                                                                                                                  • If sequence: Labels to use for all columns.
                                                                                                                                                                                  • If dict: Labels per column, where the key is the column's name. Columns that are not in the dictionary use the default labels.

                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic naming.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random. Only for strategy=\"quantile\".

                                                                                                                                                                                  Attributesfeature_names_in_: np.ndarray Names of features seen during fit.

                                                                                                                                                                                  n_features_in_: int Number of features seen during fit.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  Encoder Perform encoding of categorical features.

                                                                                                                                                                                  Imputer Handle missing values in the data.

                                                                                                                                                                                  Normalizer Transform the data to follow a Normal/Gaussian distribution.

                                                                                                                                                                                  "}, {"location": "API/data_cleaning/discretizer/#example", "title": "Example", "text": "atomstand-alone
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> print(atom[\"mean radius\"])\n\n0      13.48\n1      18.31\n2      17.93\n3      15.13\n4       8.95\n       ...  \n564    14.34\n565    13.17\n566    17.30\n567    17.68\n568    14.80\nName: mean radius, Length: 569, dtype: float64\n\n\n>>> atom.discretize(\n...     strategy=\"custom\",\n...     bins=[13, 18],\n...     labels=[\"small\", \"medium\", \"large\"],\n...     verbose=2,\n...     columns=\"mean radius\",\n... )\n\nFitting Discretizer...\nBinning the features...\n --> Discretizing feature mean radius in 3 bins.\n\n\n>>> print(atom[\"mean radius\"])\n\n0      medium\n1       large\n2      medium\n3      medium\n4       small\n        ...  \n564    medium\n565    medium\n566    medium\n567    medium\n568    medium\nName: mean radius, Length: 569, dtype: category\nCategories (3, object): ['small' < 'medium' < 'large']\n
                                                                                                                                                                                  >>> from atom.data_cleaning import Discretizer\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n>>> print(X[\"mean radius\"])\n\n0      17.99\n1      20.57\n2      19.69\n3      11.42\n4      20.29\n       ...  \n564    21.56\n565    20.13\n566    16.60\n567    20.60\n568     7.76\nName: mean radius, Length: 569, dtype: float64\n\n\n>>> discretizer = Discretizer(\n...     strategy=\"custom\",\n...     bins={\"mean radius\": [13, 18]},\n...     labels=[\"small\", \"medium\", \"large\"],\n...     verbose=2,\n... )\n>>> X = discretizer.fit_transform(X)\n\nFitting Discretizer...\nBinning the features...\n --> Discretizing feature mean radius in 3 bins.\n\n\n>>> print(X[\"mean radius\"])\n\n0      medium\n1       large\n2       large\n3       small\n4       large\n        ...  \n564     large\n565     large\n566    medium\n567     large\n568     small\nName: mean radius, Length: 569, dtype: category\nCategories (3, object): ['small' < 'medium' < 'large']\n
                                                                                                                                                                                  "}, {"location": "API/data_cleaning/discretizer/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformBin the data into intervals.

                                                                                                                                                                                  method fit(X, y=None)[source]Fit to data.

                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  ReturnsSelf Estimator instance.

                                                                                                                                                                                  method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                  method inverse_transform(X=None, y=None)[source]Do nothing.

                                                                                                                                                                                  Returns the input unchanged. Implemented for continuity of the API.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  Returnsdataframe Feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Target column. Only returned if provided.

                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                  method transform(X, y=None)[source]Bin the data into intervals.

                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  Returnsdataframe Transformed feature set.

                                                                                                                                                                                  "}, {"location": "API/data_cleaning/encoder/", "title": "Encoder", "text": "

                                                                                                                                                                                  class atom.data_cleaning.Encoder(strategy=\"Target\", max_onehot=10, ordinal=None, infrequent_to_value=None, value=\"infrequent\", n_jobs=1, verbose=0, logger=None, **kwargs)[source]Perform encoding of categorical features.

                                                                                                                                                                                  The encoding type depends on the number of classes in the column:

                                                                                                                                                                                  • If n_classes=2 or ordinal feature, use Ordinal-encoding.
                                                                                                                                                                                  • If 2 < n_classes <= max_onehot, use OneHot-encoding.
                                                                                                                                                                                  • If n_classes > max_onehot, use strategy-encoding.

                                                                                                                                                                                  Missing values are propagated to the output column. Unknown classes encountered during transforming are imputed according to the selected strategy. Infrequent classes can be replaced with a value in order to prevent too high cardinality.

                                                                                                                                                                                  This class can be accessed from atom through the encode method. Read more in the user guide.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Three category-encoders estimators are unavailable:

                                                                                                                                                                                  • OneHotEncoder: Use the max_onehot parameter.
                                                                                                                                                                                  • HashingEncoder: Incompatibility of APIs.
                                                                                                                                                                                  • LeaveOneOutEncoder: Incompatibility of APIs.

                                                                                                                                                                                  Parametersstrategy: str or estimator, default=\"Target\" Type of encoding to use for high cardinality features. Choose from any of the estimators in the category-encoders package or provide a custom one.

                                                                                                                                                                                  max_onehot: int or None, default=10 Maximum number of unique values in a feature to perform one-hot encoding. If None, strategy-encoding is always used for columns with more than two classes.

                                                                                                                                                                                  ordinal: dict or None, default=None Order of ordinal features, where the dict key is the feature's name and the value is the class order, e.g., {\"salary\": [\"low\", \"medium\", \"high\"]}.

                                                                                                                                                                                  infrequent_to_value: int, float or None, default=None Replaces infrequent class occurrences in categorical columns with the string in parameter value. This transformation is done before the encoding of the column.

                                                                                                                                                                                  • If None: Skip this step.
                                                                                                                                                                                  • If int: Minimum number of occurrences in a class.
                                                                                                                                                                                  • If float: Minimum fraction of occurrences in a class.

                                                                                                                                                                                  value: str, default=\"infrequent\" Value with which to replace rare classes. This parameter is ignored if infrequent_to_value=None.

                                                                                                                                                                                  n_jobs: int, default=1 Number of cores to use for parallel processing.

                                                                                                                                                                                  • If >0: Number of cores to use.
                                                                                                                                                                                  • If -1: Use all available cores.
                                                                                                                                                                                  • If <-1: Use number of cores + 1 + value (e.g., -2 uses all cores but one).

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic naming.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  **kwargs Additional keyword arguments for the strategy estimator.

                                                                                                                                                                                  Attributesmapping_: dict of dicts Encoded values and their respective mapping. The column name is the key to its mapping dictionary. Only for ordinal encoding.

                                                                                                                                                                                  feature_names_in_: np.ndarray Names of features seen during fit.

                                                                                                                                                                                  n_features_in_: int Number of features seen during fit.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  Cleaner Applies standard data cleaning steps on a dataset.

                                                                                                                                                                                  Imputer Handle missing values in the data.

                                                                                                                                                                                  Pruner Prune outliers from the data.

                                                                                                                                                                                  "}, {"location": "API/data_cleaning/encoder/#example", "title": "Example", "text": "atomstand-alone
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n>>> from numpy.random import randint\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n>>> X[\"cat_feature_1\"] = [f\"x{i}\" for i in randint(0, 2, len(X))]\n>>> X[\"cat_feature_2\"] = [f\"x{i}\" for i in randint(0, 3, len(X))]\n>>> X[\"cat_feature_3\"] = [f\"x{i}\" for i in randint(0, 20, len(X))]\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> print(atom.X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  cat_feature_1  cat_feature_2  cat_feature_3\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630              0.05439         0.1720  ...            0.1610            0.42250           0.5030               0.22580          0.2807                  0.10710             x0             x1            x17\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690              0.09451         0.1860  ...            0.1492            0.25360           0.3759               0.15100          0.3074                  0.07863             x0             x0            x15\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699              0.04744         0.1538  ...            
0.1315            0.18060           0.2080               0.11360          0.2504                  0.07948             x1             x0            x16\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686              0.02739         0.1852  ...            0.1148            0.09866           0.1547               0.06575          0.3233                  0.06165             x0             x0            x13\n4           8.95         15.76           58.74      245.2          0.09462           0.12430         0.09263              0.02308         0.1305  ...            0.1179            0.18790           0.1544               0.03846          0.1652                  0.07722             x0             x1            x11\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...               ...                ...              ...                   ...             ...                      ...            ...            ...            ...\n564        14.34         13.47           92.51      641.2          0.09906           0.07624         0.05724              0.04603         0.2075  ...            0.1297            0.15250           0.1632               0.10870          0.3062                  0.06072             x0             x2            x11\n565        13.17         21.81           85.42      531.5          0.09714           0.10470         0.08259              0.05252         0.1746  ...            0.1503            0.39040           0.3728               0.16070          0.3693                  0.09618             x1             x1             x5\n566        17.30         17.08          113.00      928.2          0.10080           0.10410         0.12660              0.08353         0.1813  ...            
0.1416            0.24050           0.3378               0.18570          0.3138                  0.08113             x0             x1            x17\n567        17.68         20.74          117.40      963.7          0.11150           0.16650         0.18550              0.10540         0.1971  ...            0.1418            0.34980           0.3583               0.15150          0.2463                  0.07738             x0             x0             x2\n568        14.80         17.66           95.88      674.8          0.09179           0.08890         0.04069              0.02260         0.1893  ...            0.1226            0.18810           0.2060               0.08308          0.3600                  0.07285             x0             x2            x14\n\n[569 rows x 33 columns]\n\n\n>>> atom.encode(strategy=\"target\", max_onehot=10, verbose=2)\n\nFitting Encoder...\nEncoding categorical columns...\n --> Ordinal-encoding feature cat_feature_1. Contains 2 classes.\n --> OneHot-encoding feature cat_feature_2. Contains 3 classes.\n --> Target-encoding feature cat_feature_3. Contains 20 classes.\n\n\n>>> # Note the one-hot encoded column with name [feature]_[class]\n>>> print(atom.X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst concavity  worst concave points  worst symmetry  worst fractal dimension  cat_feature_1  cat_feature_2_x1  cat_feature_2_x0  cat_feature_2_x2  cat_feature_3\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630              0.05439         0.1720  ...           0.5030               0.22580          0.2807                  0.10710            0.0               1.0               0.0               0.0       0.622917\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690              0.09451         0.1860  ...    
       0.3759               0.15100          0.3074                  0.07863            0.0               0.0               1.0               0.0       0.619953\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699              0.04744         0.1538  ...           0.2080               0.11360          0.2504                  0.07948            1.0               0.0               1.0               0.0       0.636924\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686              0.02739         0.1852  ...           0.1547               0.06575          0.3233                  0.06165            0.0               0.0               1.0               0.0       0.585368\n4           8.95         15.76           58.74      245.2          0.09462           0.12430         0.09263              0.02308         0.1305  ...           0.1544               0.03846          0.1652                  0.07722            0.0               1.0               0.0               0.0       0.638596\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...                   ...             ...                      ...            ...               ...               ...               ...            ...\n564        14.34         13.47           92.51      641.2          0.09906           0.07624         0.05724              0.04603         0.2075  ...           0.1632               0.10870          0.3062                  0.06072            0.0               0.0               0.0               1.0       0.638596\n565        13.17         21.81           85.42      531.5          0.09714           0.10470         0.08259              0.05252         0.1746  ...           
0.3728               0.16070          0.3693                  0.09618            1.0               1.0               0.0               0.0       0.588596\n566        17.30         17.08          113.00      928.2          0.10080           0.10410         0.12660              0.08353         0.1813  ...           0.3378               0.18570          0.3138                  0.08113            0.0               1.0               0.0               0.0       0.622917\n567        17.68         20.74          117.40      963.7          0.11150           0.16650         0.18550              0.10540         0.1971  ...           0.3583               0.15150          0.2463                  0.07738            0.0               0.0               1.0               0.0       0.688596\n568        14.80         17.66           95.88      674.8          0.09179           0.08890         0.04069              0.02260         0.1893  ...           0.2060               0.08308          0.3600                  0.07285            0.0               0.0               0.0               1.0       0.662643\n\n[569 rows x 35 columns]\n
                                                                                                                                                                                  >>> from atom.data_cleaning import Encoder\n>>> from sklearn.datasets import load_breast_cancer\n>>> from numpy.random import randint\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n>>> X[\"cat_feature_1\"] = [f\"x{i}\" for i in randint(0, 2, len(X))]\n>>> X[\"cat_feature_2\"] = [f\"x{i}\" for i in randint(0, 3, len(X))]\n>>> X[\"cat_feature_3\"] = [f\"x{i}\" for i in randint(0, 20, len(X))]\n>>> print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  cat_feature_1  cat_feature_2  cat_feature_3\n0          17.99         10.38          122.80     1001.0          0.11840           0.27760         0.30010              0.14710         0.2419  ...           0.16220            0.66560           0.7119                0.2654          0.4601                  0.11890             x1             x2             x5\n1          20.57         17.77          132.90     1326.0          0.08474           0.07864         0.08690              0.07017         0.1812  ...           0.12380            0.18660           0.2416                0.1860          0.2750                  0.08902             x1             x2            x13\n2          19.69         21.25          130.00     1203.0          0.10960           0.15990         0.19740              0.12790         0.2069  ...           
0.14440            0.42450           0.4504                0.2430          0.3613                  0.08758             x0             x0            x15\n3          11.42         20.38           77.58      386.1          0.14250           0.28390         0.24140              0.10520         0.2597  ...           0.20980            0.86630           0.6869                0.2575          0.6638                  0.17300             x0             x2            x10\n4          20.29         14.34          135.10     1297.0          0.10030           0.13280         0.19800              0.10430         0.1809  ...           0.13740            0.20500           0.4000                0.1625          0.2364                  0.07678             x1             x1            x17\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...               ...                ...              ...                   ...             ...                      ...            ...            ...            ...\n564        21.56         22.39          142.00     1479.0          0.11100           0.11590         0.24390              0.13890         0.1726  ...           0.14100            0.21130           0.4107                0.2216          0.2060                  0.07115             x1             x1            x12\n565        20.13         28.25          131.20     1261.0          0.09780           0.10340         0.14400              0.09791         0.1752  ...           0.11660            0.19220           0.3215                0.1628          0.2572                  0.06637             x0             x2            x14\n566        16.60         28.08          108.30      858.1          0.08455           0.10230         0.09251              0.05302         0.1590  ...           
0.11390            0.30940           0.3403                0.1418          0.2218                  0.07820             x0             x1             x3\n567        20.60         29.33          140.10     1265.0          0.11780           0.27700         0.35140              0.15200         0.2397  ...           0.16500            0.86810           0.9387                0.2650          0.4087                  0.12400             x1             x0             x2\n568         7.76         24.54           47.92      181.0          0.05263           0.04362         0.00000              0.00000         0.1587  ...           0.08996            0.06444           0.0000                0.0000          0.2871                  0.07039             x1             x1            x11\n\n[569 rows x 33 columns]\n\n\n>>> encoder = Encoder(strategy=\"target\", max_onehot=10, verbose=2)\n>>> X = encoder.fit_transform(X, y)\n\nFitting Encoder...\nEncoding categorical columns...\n --> Ordinal-encoding feature cat_feature_1. Contains 2 classes.\n --> OneHot-encoding feature cat_feature_2. Contains 3 classes.\n --> Target-encoding feature cat_feature_3. Contains 20 classes.\n\n\n>>> # Note the one-hot encoded column with name [feature]_[class]\n>>> print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst concavity  worst concave points  worst symmetry  worst fractal dimension  cat_feature_1  cat_feature_2_x2  cat_feature_2_x0  cat_feature_2_x1  cat_feature_3\n0          17.99         10.38          122.80     1001.0          0.11840           0.27760         0.30010              0.14710         0.2419  ...           
0.7119                0.2654          0.4601                  0.11890            1.0               1.0               0.0               0.0       0.645086\n1          20.57         17.77          132.90     1326.0          0.08474           0.07864         0.08690              0.07017         0.1812  ...           0.2416                0.1860          0.2750                  0.08902            1.0               1.0               0.0               0.0       0.604148\n2          19.69         21.25          130.00     1203.0          0.10960           0.15990         0.19740              0.12790         0.2069  ...           0.4504                0.2430          0.3613                  0.08758            0.0               0.0               1.0               0.0       0.675079\n3          11.42         20.38           77.58      386.1          0.14250           0.28390         0.24140              0.10520         0.2597  ...           0.6869                0.2575          0.6638                  0.17300            0.0               1.0               0.0               0.0       0.706297\n4          20.29         14.34          135.10     1297.0          0.10030           0.13280         0.19800              0.10430         0.1809  ...           0.4000                0.1625          0.2364                  0.07678            1.0               0.0               0.0               1.0       0.716566\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...                   ...             ...                      ...            ...               ...               ...               ...            ...\n564        21.56         22.39          142.00     1479.0          0.11100           0.11590         0.24390              0.13890         0.1726  ...           
0.4107                0.2216          0.2060                  0.07115            1.0               0.0               0.0               1.0       0.598024\n565        20.13         28.25          131.20     1261.0          0.09780           0.10340         0.14400              0.09791         0.1752  ...           0.3215                0.1628          0.2572                  0.06637            0.0               1.0               0.0               0.0       0.683185\n566        16.60         28.08          108.30      858.1          0.08455           0.10230         0.09251              0.05302         0.1590  ...           0.3403                0.1418          0.2218                  0.07820            0.0               0.0               0.0               1.0       0.472908\n567        20.60         29.33          140.10     1265.0          0.11780           0.27700         0.35140              0.15200         0.2397  ...           0.9387                0.2650          0.4087                  0.12400            1.0               0.0               1.0               0.0       0.585452\n568         7.76         24.54           47.92      181.0          0.05263           0.04362         0.00000              0.00000         0.1587  ...           0.0000                0.0000          0.2871                  0.07039            1.0               0.0               0.0               1.0       0.516759\n\n[569 rows x 35 columns]\n
                                                                                                                                                                                  "}, {"location": "API/data_cleaning/encoder/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformEncode the data.

                                                                                                                                                                                  method fit(X, y=None)[source] Fit to data.

                                                                                                                                                                                  Note that leaving y=None can lead to errors if the strategy encoder requires target values. For multioutput tasks, only the first target column is used to fit the encoder.

                                                                                                                                                                                  Parameters X: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  Returns Self Estimator instance.

                                                                                                                                                                                  method fit_transform(X=None, y=None, **fit_params)[source] Fit to data, then transform it.

                                                                                                                                                                                  Parameters X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                  Returns dataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                  method get_params(deep=True)[source] Get parameters for this estimator.

                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                  Returns params: dict Parameter names mapped to their values.

                                                                                                                                                                                  method inverse_transform(X=None, y=None)[source] Do nothing.

                                                                                                                                                                                  Returns the input unchanged. Implemented for continuity of the API.

                                                                                                                                                                                  Parameters X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  Returnsdataframe Feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Target column. Only returned if provided.

                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                  method transform(X, y=None)[source]Encode the data.

                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  Returnsdataframe Encoded dataframe.

                                                                                                                                                                                  "}, {"location": "API/data_cleaning/imputer/", "title": "Imputer", "text": "

                                                                                                                                                                                  class atom.data_cleaning.Imputer(strat_num=\"drop\", strat_cat=\"drop\", max_nan_rows=None, max_nan_cols=None, n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, random_state=None)[source]Handle missing values in the data.

                                                                                                                                                                                  Impute or remove missing values according to the selected strategy. Also removes rows and columns with too many missing values. Use the missing attribute to customize what are considered \"missing values\".

                                                                                                                                                                                  This class can be accessed from atom through the impute method. Read more in the user guide.

                                                                                                                                                                                  Parametersstrat_num: str, int or float, default=\"drop\" Imputing strategy for numerical columns. Choose from:

                                                                                                                                                                                  • \"drop\": Drop rows containing missing values.
                                                                                                                                                                                  • \"mean\": Impute with mean of column.
                                                                                                                                                                                  • \"median\": Impute with median of column.
                                                                                                                                                                                  • \"knn\": Impute using a K-Nearest Neighbors approach.
                                                                                                                                                                                  • \"iterative\": Impute using a multivariate imputer.
                                                                                                                                                                                  • \"most_frequent\": Impute with the most frequent value.
                                                                                                                                                                                  • int or float: Impute with provided numerical value.

                                                                                                                                                                                  strat_cat: str, default=\"drop\" Imputing strategy for categorical columns. Choose from:

                                                                                                                                                                                  • \"drop\": Drop rows containing missing values.
                                                                                                                                                                                  • \"most_frequent\": Impute with the most frequent value.
                                                                                                                                                                                  • str: Impute with provided string.

                                                                                                                                                                                  max_nan_rows: int, float or None, default=None Maximum number or fraction of missing values in a row (if more, the row is removed). If None, ignore this step.

                                                                                                                                                                                  max_nan_cols: int, float or None, default=None Maximum number or fraction of missing values in a column (if more, the column is removed). If None, ignore this step.

                                                                                                                                                                                  n_jobs: int, default=1 Number of cores to use for parallel processing.

                                                                                                                                                                                  • If >0: Number of cores to use.
                                                                                                                                                                                  • If -1: Use all available cores.
                                                                                                                                                                                  • If <-1: Use number of cores + 1 + value (e.g., -2 uses all cores but one).

                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic naming.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random. Only used when strat_num=\"iterative\".

                                                                                                                                                                                  Attributesmissing_: list Values that are considered \"missing\". Default values are: None, NaN, NA, NaT, +inf, -inf, \"\", \"?\", \"NA\", \"nan\", \"NaN\", \"NaT\", \"none\", \"None\", \"inf\", \"-inf\". Note that None, NaN, NA, +inf and -inf are always considered missing since they are incompatible with sklearn estimators.

                                                                                                                                                                                  feature_names_in_: np.ndarray Names of features seen during fit.

                                                                                                                                                                                  n_features_in_: int Number of features seen during fit.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  Balancer Balance the number of samples per class in the target column.

                                                                                                                                                                                  Discretizer Bin continuous data into intervals.

                                                                                                                                                                                  Encoder Perform encoding of categorical features.

                                                                                                                                                                                  "}, {"location": "API/data_cleaning/imputer/#example", "title": "Example", "text": "atomstand-alone
                                                                                                                                                                                  >>> import numpy as np\n>>> from atom import ATOMClassifier\n>>> from numpy.random import randint\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> # Add some random missing values to the data\n>>> for i, j in zip(randint(0, X.shape[0], 600), randint(0, 4, 600)):\n...     X.iat[i, j] = np.nan\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> print(atom.nans)\n\nmean radius                130\nmean texture               141\nmean perimeter             124\nmean area                  136\nmean smoothness              0\nmean compactness             0\nmean concavity               0\nmean concave points          0\nmean symmetry                0\nmean fractal dimension       0\nradius error                 0\ntexture error                0\nperimeter error              0\narea error                   0\nsmoothness error             0\ncompactness error            0\nconcavity error              0\nconcave points error         0\nsymmetry error               0\nfractal dimension error      0\nworst radius                 0\nworst texture                0\nworst perimeter              0\nworst area                   0\nworst smoothness             0\nworst compactness            0\nworst concavity              0\nworst concave points         0\nworst symmetry               0\nworst fractal dimension      0\ndtype: int64\n\n\n>>> atom.impute(strat_num=\"median\", max_nan_rows=0.1, verbose=2)\n\nFitting Imputer...\nImputing missing values...\n --> Imputing 130 missing values with median (13.27) in feature mean radius.\n --> Imputing 141 missing values with median (18.87) in feature mean texture.\n --> Imputing 124 missing values with median (85.66) in feature mean perimeter.\n --> Imputing 136 missing values with 
median (555.1) in feature mean area.\n\n\n>>> print(atom.n_nans)\n\n0\n
                                                                                                                                                                                  >>> import numpy as np\n>>> from atom.data_cleaning import Imputer\n>>> from numpy.random import randint\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> # Add some random missing values to the data\n>>> for i, j in zip(randint(0, X.shape[0], 600), randint(0, 4, 600)):\n...     X.iloc[i, j] = np.nan\n\n>>> imputer = Imputer(strat_num=\"median\", max_nan_rows=0.1, verbose=2)\n>>> X, y = imputer.fit_transform(X, y)\n\nFitting Imputer...\nImputing missing values...\n --> Dropping 2 samples for containing more than 3 missing values.\n --> Imputing 124 missing values with median (13.38) in feature mean radius.\n --> Imputing 127 missing values with median (18.87) in feature mean texture.\n --> Imputing 137 missing values with median (86.54) in feature mean perimeter.\n --> Imputing 134 missing values with median (561.3) in feature mean area.\n\n\n>>> print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst texture  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension\n0          13.38        10.380         122.800     1001.0          0.11840           0.27760         0.30010              0.14710         0.2419  ...          17.33           184.60      2019.0           0.16220            0.66560           0.7119                0.2654          0.4601                  0.11890\n1          20.57        17.770          86.545      561.3          0.08474           0.07864         0.08690              0.07017         0.1812  ...          
23.41           158.80      1956.0           0.12380            0.18660           0.2416                0.1860          0.2750                  0.08902\n2          19.69        21.250         130.000     1203.0          0.10960           0.15990         0.19740              0.12790         0.2069  ...          25.53           152.50      1709.0           0.14440            0.42450           0.4504                0.2430          0.3613                  0.08758\n3          11.42        20.380          77.580      386.1          0.14250           0.28390         0.24140              0.10520         0.2597  ...          26.50            98.87       567.7           0.20980            0.86630           0.6869                0.2575          0.6638                  0.17300\n4          13.38        14.340         135.100     1297.0          0.10030           0.13280         0.19800              0.10430         0.1809  ...          16.67           152.20      1575.0           0.13740            0.20500           0.4000                0.1625          0.2364                  0.07678\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...            ...              ...         ...               ...                ...              ...                   ...             ...                      ...\n564        21.56        22.390          86.545      561.3          0.11100           0.11590         0.24390              0.13890         0.1726  ...          26.40           166.10      2027.0           0.14100            0.21130           0.4107                0.2216          0.2060                  0.07115\n565        20.13        18.865         131.200     1261.0          0.09780           0.10340         0.14400              0.09791         0.1752  ...          
38.25           155.00      1731.0           0.11660            0.19220           0.3215                0.1628          0.2572                  0.06637\n566        13.38        28.080          86.545      561.3          0.08455           0.10230         0.09251              0.05302         0.1590  ...          34.12           126.70      1124.0           0.11390            0.30940           0.3403                0.1418          0.2218                  0.07820\n567        20.60        29.330         140.100     1265.0          0.11780           0.27700         0.35140              0.15200         0.2397  ...          39.42           184.60      1821.0           0.16500            0.86810           0.9387                0.2650          0.4087                  0.12400\n568        13.38        24.540          47.920      181.0          0.05263           0.04362         0.00000              0.00000         0.1587  ...          30.37            59.16       268.6           0.08996            0.06444           0.0000                0.0000          0.2871                  0.07039\n\n[567 rows x 30 columns]\n
                                                                                                                                                                                  "}, {"location": "API/data_cleaning/imputer/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformImpute the missing values.

                                                                                                                                                                                  method fit(X, y=None)[source]Fit to data.

                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  ReturnsSelf Estimator instance.

                                                                                                                                                                                  method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                  method inverse_transform(X=None, y=None)[source]Do nothing.

                                                                                                                                                                                  Returns the input unchanged. Implemented for continuity of the API.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  Returnsdataframe Feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Target column. Only returned if provided.

                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                  method transform(X, y=None)[source] Impute the missing values.

                                                                                                                                                                                  Note that leaving y=None can lead to inconsistencies in data length between X and y if rows are dropped during the transformation.

                                                                                                                                                                                  Parameters: X: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  Returns: dataframe Imputed dataframe.

                                                                                                                                                                                  series Transformed target column. Only returned if provided.

                                                                                                                                                                                  "}, {"location": "API/data_cleaning/normalizer/", "title": "Normalizer", "text": "

                                                                                                                                                                                  class atom.data_cleaning.Normalizer(strategy=\"yeojohnson\", device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, random_state=None, **kwargs)[source]Transform the data to follow a Normal/Gaussian distribution.

                                                                                                                                                                                  This transformation is useful for modeling issues related to heteroscedasticity (non-constant variance), or other situations where normality is desired. Missing values are disregarded in fit and maintained in transform. Categorical columns are ignored.

                                                                                                                                                                                  This class can be accessed from atom through the normalize method. Read more in the user guide.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  The quantile strategy performs a non-linear transformation. This may distort linear correlations between variables measured at the same scale but renders variables measured at different scales more directly comparable.

                                                                                                                                                                                  Note

                                                                                                                                                                                  The yeojohnson and boxcox strategies scale the data after transforming. Use the kwargs to change this behavior.

                                                                                                                                                                                  Parameters: strategy: str, default=\"yeojohnson\" The transforming strategy. Choose from:

                                                                                                                                                                                  • \"yeojohnson\"
                                                                                                                                                                                  • \"boxcox\" (only works with strictly positive values)
                                                                                                                                                                                  • \"quantile\": Transform features using quantile information.

                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.

                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic naming.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  random_state: int or None, default=None Seed used by the quantile strategy. If None, the random number generator is the RandomState used by np.random.

                                                                                                                                                                                  **kwargs Additional keyword arguments for the strategy estimator.

                                                                                                                                                                                  Attributes[strategy]_: sklearn transformer Object with which the data is transformed, e.g., normalizer.yeojohnson for the default strategy.

                                                                                                                                                                                  feature_names_in_: np.ndarray Names of features seen during fit.

                                                                                                                                                                                  n_features_in_: int Number of features seen during fit.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  Cleaner Applies standard data cleaning steps on a dataset.

                                                                                                                                                                                  Pruner Prune outliers from the data.

                                                                                                                                                                                  Scaler Scale the data.

                                                                                                                                                                                  "}, {"location": "API/data_cleaning/normalizer/#example", "title": "Example", "text": "atomstand-alone
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630              0.05439         0.1720  ...           107.30       740.4            0.1610            0.42250           0.5030               0.22580          0.2807                  0.10710       0\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690              0.09451         0.1860  ...           142.20      1493.0            0.1492            0.25360           0.3759               0.15100          0.3074                  0.07863       0\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699              0.04744         0.1538  ...           135.10      1320.0            0.1315            0.18060           0.2080               0.11360          0.2504                  0.07948       0\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686              0.02739         0.1852  ...           
110.10       931.4            0.1148            0.09866           0.1547               0.06575          0.3233                  0.06165       0\n4           8.95         15.76           58.74      245.2          0.09462           0.12430         0.09263              0.02308         0.1305  ...            63.34       270.0            0.1179            0.18790           0.1544               0.03846          0.1652                  0.07722       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     ...\n564        14.34         13.47           92.51      641.2          0.09906           0.07624         0.05724              0.04603         0.2075  ...           110.40       873.2            0.1297            0.15250           0.1632               0.10870          0.3062                  0.06072       1\n565        13.17         21.81           85.42      531.5          0.09714           0.10470         0.08259              0.05252         0.1746  ...           105.50       740.7            0.1503            0.39040           0.3728               0.16070          0.3693                  0.09618       0\n566        17.30         17.08          113.00      928.2          0.10080           0.10410         0.12660              0.08353         0.1813  ...           130.90      1222.0            0.1416            0.24050           0.3378               0.18570          0.3138                  0.08113       0\n567        17.68         20.74          117.40      963.7          0.11150           0.16650         0.18550              0.10540         0.1971  ...           
132.90      1302.0            0.1418            0.34980           0.3583               0.15150          0.2463                  0.07738       0\n568        14.80         17.66           95.88      674.8          0.09179           0.08890         0.04069              0.02260         0.1893  ...           105.90       829.5            0.1226            0.18810           0.2060               0.08308          0.3600                  0.07285       1\n\n[569 rows x 31 columns]\n\n\n>>> atom.plot_distribution(columns=0)\n
                                                                                                                                                                                  >>> atom.normalize(verbose=2)\n\nFitting Normalizer...\nNormalizing features...\n\n\n>>> print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0      -0.017068      0.464087        0.031104  -0.020222         0.390628          0.620790        0.562136             0.426774      -0.280554  ...         0.251532    0.081524          1.224389           1.206519         1.189835              1.522769       -0.043007                 1.378960       0\n1       1.182066      0.411242        1.183030   1.200556         0.741209          0.608244        1.100342             1.256472       0.256014  ...         1.119375    1.218096          0.759546           0.244492         0.726989              0.650523        0.424017                -0.164104       0\n2       1.105309      1.197684        1.018344   1.106437        -0.552214         -0.652544       -0.230044             0.226950      -1.050816  ...         0.973194    1.037232          0.002307          -0.374986        -0.128679              0.107299       -0.647198                -0.100126       0\n3       0.455144      2.077941        0.379512   0.486019        -0.966587         -1.447057       -0.438308            -0.480189       0.226570  ...         0.337722    0.483003         -0.785100          -1.301043        -0.483292             -0.722786        0.676588                -1.783846       0\n4      -1.898537     -0.815757       -1.745528  -1.873415        -0.102067          0.599235        0.374346            -0.662103      -2.173761  ...        
-1.869111   -2.095123         -0.633206          -0.305478        -0.485431             -1.278472       -2.898859                -0.273347       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     ...\n564     0.238929     -1.546154        0.209113   0.257899         0.214334         -0.482480       -0.225132             0.183841       0.996371  ...         0.346743    0.373205         -0.079012          -0.660736        -0.423384              0.029761        0.404215                -1.894769       1\n565    -0.115233      0.675396       -0.105672  -0.125511         0.078814          0.213069        0.222118             0.375009      -0.177404  ...         0.194134    0.082260          0.804177           1.061384         0.714032              0.778530        1.315113                 0.913117       0\n566     0.972621     -0.443853        0.950416   0.971288         0.335466          0.200161        0.804757             1.074782       0.080964  ...         0.880583    0.920102          0.443592           0.144776         0.561298              1.086695        0.527842                 0.020173       0\n567     1.053489      0.446545        1.084407   1.040647         1.046541          1.237987        1.321388             1.410770       0.650180  ...         0.925288    1.016604          0.452080           0.855688         0.652219              0.657243       -0.735710                -0.260751       0\n568     0.366875     -0.289945        0.346701   0.359700        -0.309357         -0.150999       -0.574459            -0.683107       0.375972  ...         
0.207028    0.284140         -0.407994          -0.303600        -0.141124             -0.402554        1.196110                -0.638106       1\n\n[569 rows x 31 columns]\n\n\n>>> atom.plot_distribution(columns=0)\n
                                                                                                                                                                                  >>> from atom.data_cleaning import Normalizer\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> normalizer = Normalizer(verbose=2)\n>>> X = normalizer.fit_transform(X)\n\nFitting Normalizer...\nNormalizing features...\n\n\n>>> print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst texture  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension\n0       1.134881     -2.678666        1.259822   1.126421         1.504114          2.165938        1.862988             1.848558       1.953067  ...      -1.488367         1.810506    1.652210          1.282792           1.942737         1.730182              1.935654        2.197206                 1.723624\n1       1.619346     -0.264377        1.528723   1.633946        -0.820227         -0.384102        0.291976             0.820609       0.102291  ...      -0.288382         1.430616    1.610022         -0.325080          -0.296580         0.070746              1.101594       -0.121997                 0.537179\n2       1.464796      0.547806        1.454664   1.461645         0.963977          1.163977        1.403673             1.683104       0.985668  ...       0.071406         1.321941    1.425307          0.580301           1.209701         1.005512              1.722744        1.218181                 0.453955\n3      -0.759262      0.357721       -0.514886  -0.836238         2.781494          2.197843        1.642391             1.423004       2.360528  ...       
0.228089        -0.039480   -0.436860          2.857821           2.282276         1.675087              1.862378        3.250202                 2.517606\n4       1.571260     -1.233520        1.583340   1.595120         0.343932          0.762392        1.407479             1.410929       0.090964  ...      -1.637882         1.316582    1.309486          0.284367          -0.131829         0.817474              0.807077       -0.943554                -0.279402\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...            ...              ...         ...               ...                ...              ...                   ...             ...                      ...\n564     1.781795      0.785604        1.746492   1.823030         1.052829          0.460810        1.653784             1.783067      -0.232645  ...       0.212151         1.547961    1.657442          0.438013          -0.077871         0.859079              1.503734       -1.721528                -0.751459\n565     1.543335      1.845150        1.485601   1.545430         0.168014          0.207602        0.984746             1.320730      -0.129120  ...       1.832201         1.365939    1.443167         -0.667317          -0.245277         0.480804              0.810995       -0.480093                -1.210527\n566     0.828589      1.817618        0.811329   0.835270        -0.835509          0.183969        0.375105             0.396882      -0.808189  ...       1.320625         0.786129    0.796192         -0.799337           0.626487         0.566826              0.526136       -1.301164                -0.170872\n567     1.624440      2.016299        1.702747   1.551036         1.468642          2.162820        1.994466             1.884414       1.899087  ...       
1.968949         1.810506    1.513198          1.387135           2.284642         2.136932              1.931990        1.744693                 1.850944\n568    -2.699432      1.203224       -2.827766  -2.703256        -3.834325         -1.481409       -1.658319            -1.845392      -0.821560  ...       0.810681        -2.231436   -2.149403         -2.064647          -1.731936        -1.819966             -2.131070        0.103122                -0.820663\n\n[569 rows x 30 columns]\n
                                                                                                                                                                                  "}, {"location": "API/data_cleaning/normalizer/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  fit: Fit to data. fit_transform: Fit to data, then transform it. get_params: Get parameters for this estimator. inverse_transform: Apply the inverse transformation to the data. set_params: Set the parameters of this estimator. transform: Apply the transformations to the data.

                                                                                                                                                                                  method fit(X, y=None)[source] Fit to data.

                                                                                                                                                                                  Parameters: X: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  Returns: Self Estimator instance.

                                                                                                                                                                                  method fit_transform(X=None, y=None, **fit_params)[source] Fit to data, then transform it.

                                                                                                                                                                                  Parameters: X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                  Returns: dataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                  method get_params(deep=True)[source] Get parameters for this estimator.

                                                                                                                                                                                  Parameters: deep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                  Returns: params : dict Parameter names mapped to their values.

                                                                                                                                                                                  method inverse_transform(X, y=None)[source] Apply the inverse transformation to the data.

                                                                                                                                                                                  Parameters: X: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  Returns: dataframe Original dataframe.

                                                                                                                                                                                  method set_params(**params)[source] Set the parameters of this estimator.

                                                                                                                                                                                  Parameters: **params : dict Estimator parameters.

                                                                                                                                                                                  Returns: self : estimator instance Estimator instance.

                                                                                                                                                                                  method transform(X, y=None)[source]Apply the transformations to the data.

                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  Returnsdataframe Normalized dataframe.

                                                                                                                                                                                  "}, {"location": "API/data_cleaning/pruner/", "title": "Pruner", "text": "

                                                                                                                                                                                  class atom.data_cleaning.Pruner(strategy=\"zscore\", method=\"drop\", max_sigma=3, include_target=False, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, **kwargs)[source]Prune outliers from the data.

                                                                                                                                                                                  Replace or remove outliers. The definition of outlier depends on the selected strategy and can greatly differ from one another. Ignores categorical columns.

                                                                                                                                                                                  This class can be accessed from atom through the prune method. Read more in the user guide.

                                                                                                                                                                                  Info

                                                                                                                                                                                  The \"sklearnex\" and \"cuml\" engines are only supported for strategy=\"dbscan\".

                                                                                                                                                                                  Parametersstrategy: str or sequence, default=\"zscore\" Strategy with which to select the outliers. If sequence of strategies, only samples marked as outliers by all chosen strategies are dropped. Choose from:

                                                                                                                                                                                  • \"zscore\": Z-score of each data value.
                                                                                                                                                                                  • \"iforest\": Isolation Forest.
                                                                                                                                                                                  • \"ee\": Elliptic Envelope.
                                                                                                                                                                                  • \"lof\": Local Outlier Factor.
                                                                                                                                                                                  • \"svm\": One-class SVM.
                                                                                                                                                                                  • \"dbscan\": Density-Based Spatial Clustering.
                                                                                                                                                                                  • \"hdbscan\": Hierarchical Density-Based Spatial Clustering.
                                                                                                                                                                                  • \"optics\": DBSCAN-like clustering approach.

                                                                                                                                                                                  method: int, float or str, default=\"drop\" Method to apply on the outliers. Only the zscore strategy accepts another method than \"drop\". Choose from:

                                                                                                                                                                                  • \"drop\": Drop any sample with outlier values.
                                                                                                                                                                                  • \"minmax\": Replace outlier with the min/max of the column.
                                                                                                                                                                                  • Any numerical value with which to replace the outliers.

                                                                                                                                                                                  max_sigma: int or float, default=3 Maximum allowed standard deviations from the mean of the column. If more, it is considered an outlier. Only if strategy=\"zscore\".

                                                                                                                                                                                  include_target: bool, default=False Whether to include the target column in the search for outliers. This can be useful for regression tasks. Only if strategy=\"zscore\".

                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                    • \"sklearnex\"
                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic naming.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  **kwargs Additional keyword arguments for the strategy estimator. If sequence of strategies, the params should be provided in a dict with the strategy's name as key.

                                                                                                                                                                                  Attributes[strategy]_: sklearn estimator Object used to prune the data, e.g., pruner.iforest for the isolation forest strategy. Not available for strategy=\"zscore\".

                                                                                                                                                                                  feature_names_in_: np.ndarray Names of features seen during fit.

                                                                                                                                                                                  n_features_in_: int Number of features seen during fit.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  Balancer Balance the number of samples per class in the target column.

                                                                                                                                                                                  Normalizer Transform the data to follow a Normal/Gaussian distribution.

                                                                                                                                                                                  Scaler Scale the data.

                                                                                                                                                                                  "}, {"location": "API/data_cleaning/pruner/#example", "title": "Example", "text": "atomstand-alone
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630              0.05439         0.1720  ...           107.30       740.4            0.1610            0.42250           0.5030               0.22580          0.2807                  0.10710       0\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690              0.09451         0.1860  ...           142.20      1493.0            0.1492            0.25360           0.3759               0.15100          0.3074                  0.07863       0\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699              0.04744         0.1538  ...           135.10      1320.0            0.1315            0.18060           0.2080               0.11360          0.2504                  0.07948       0\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686              0.02739         0.1852  ...           
110.10       931.4            0.1148            0.09866           0.1547               0.06575          0.3233                  0.06165       0\n4           8.95         15.76           58.74      245.2          0.09462           0.12430         0.09263              0.02308         0.1305  ...            63.34       270.0            0.1179            0.18790           0.1544               0.03846          0.1652                  0.07722       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     ...\n564        14.34         13.47           92.51      641.2          0.09906           0.07624         0.05724              0.04603         0.2075  ...           110.40       873.2            0.1297            0.15250           0.1632               0.10870          0.3062                  0.06072       1\n565        13.17         21.81           85.42      531.5          0.09714           0.10470         0.08259              0.05252         0.1746  ...           105.50       740.7            0.1503            0.39040           0.3728               0.16070          0.3693                  0.09618       0\n566        17.30         17.08          113.00      928.2          0.10080           0.10410         0.12660              0.08353         0.1813  ...           130.90      1222.0            0.1416            0.24050           0.3378               0.18570          0.3138                  0.08113       0\n567        17.68         20.74          117.40      963.7          0.11150           0.16650         0.18550              0.10540         0.1971  ...           
132.90      1302.0            0.1418            0.34980           0.3583               0.15150          0.2463                  0.07738       0\n568        14.80         17.66           95.88      674.8          0.09179           0.08890         0.04069              0.02260         0.1893  ...           105.90       829.5            0.1226            0.18810           0.2060               0.08308          0.3600                  0.07285       1\n\n[569 rows x 31 columns]\n\n\n>>> atom.prune(strategy=\"iforest\", verbose=2)\n\nFitting Pruner...\nPruning outliers...\n --> Dropping 63 outliers.\n\n\n>>> # Note the reduced number of rows\n>>> print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630              0.05439         0.1720  ...           107.30       740.4            0.1610            0.42250           0.5030               0.22580          0.2807                  0.10710       0\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690              0.09451         0.1860  ...           142.20      1493.0            0.1492            0.25360           0.3759               0.15100          0.3074                  0.07863       0\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699              0.04744         0.1538  ...           
135.10      1320.0            0.1315            0.18060           0.2080               0.11360          0.2504                  0.07948       0\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686              0.02739         0.1852  ...           110.10       931.4            0.1148            0.09866           0.1547               0.06575          0.3233                  0.06165       0\n4          10.26         16.58           65.85      320.8          0.08877           0.08066         0.04358              0.02438         0.1669  ...            71.08       357.4            0.1461            0.22460           0.1783               0.08333          0.2691                  0.09479       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     ...\n501        14.34         13.47           92.51      641.2          0.09906           0.07624         0.05724              0.04603         0.2075  ...           110.40       873.2            0.1297            0.15250           0.1632               0.10870          0.3062                  0.06072       1\n502        13.17         21.81           85.42      531.5          0.09714           0.10470         0.08259              0.05252         0.1746  ...           105.50       740.7            0.1503            0.39040           0.3728               0.16070          0.3693                  0.09618       0\n503        17.30         17.08          113.00      928.2          0.10080           0.10410         0.12660              0.08353         0.1813  ...           
130.90      1222.0            0.1416            0.24050           0.3378               0.18570          0.3138                  0.08113       0\n504        17.68         20.74          117.40      963.7          0.11150           0.16650         0.18550              0.10540         0.1971  ...           132.90      1302.0            0.1418            0.34980           0.3583               0.15150          0.2463                  0.07738       0\n505        14.80         17.66           95.88      674.8          0.09179           0.08890         0.04069              0.02260         0.1893  ...           105.90       829.5            0.1226            0.18810           0.2060               0.08308          0.3600                  0.07285       1\n\n[506 rows x 31 columns]\n\n\n>>> atom.plot_distribution(columns=0)\n
                                                                                                                                                                                  >>> from atom.data_cleaning import Normalizer\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> normalizer = Normalizer(verbose=2)\n>>> X = normalizer.fit_transform(X)\n\nFitting Normalizer...\nNormalizing features...\n\n\n>>> # Note the reduced number of rows\n>>> print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst texture  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension\n0       1.134881     -2.678666        1.259822   1.126421         1.504114          2.165938        1.862988             1.848558       1.953067  ...      -1.488367         1.810506    1.652210          1.282792           1.942737         1.730182              1.935654        2.197206                 1.723624\n1       1.619346     -0.264377        1.528723   1.633946        -0.820227         -0.384102        0.291976             0.820609       0.102291  ...      -0.288382         1.430616    1.610022         -0.325080          -0.296580         0.070746              1.101594       -0.121997                 0.537179\n2       1.464796      0.547806        1.454664   1.461645         0.963977          1.163977        1.403673             1.683104       0.985668  ...       0.071406         1.321941    1.425307          0.580301           1.209701         1.005512              1.722744        1.218181                 0.453955\n3      -0.759262      0.357721       -0.514886  -0.836238         2.781494          2.197843        1.642391             1.423004       2.360528  ...       
0.228089        -0.039480   -0.436860          2.857821           2.282276         1.675087              1.862378        3.250202                 2.517606\n4       1.571260     -1.233520        1.583340   1.595120         0.343932          0.762392        1.407479             1.410929       0.090964  ...      -1.637882         1.316582    1.309486          0.284367          -0.131829         0.817474              0.807077       -0.943554                -0.279402\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...            ...              ...         ...               ...                ...              ...                   ...             ...                      ...\n564     1.781795      0.785604        1.746492   1.823030         1.052829          0.460810        1.653784             1.783067      -0.232645  ...       0.212151         1.547961    1.657442          0.438013          -0.077871         0.859079              1.503734       -1.721528                -0.751459\n565     1.543335      1.845150        1.485601   1.545430         0.168014          0.207602        0.984746             1.320730      -0.129120  ...       1.832201         1.365939    1.443167         -0.667317          -0.245277         0.480804              0.810995       -0.480093                -1.210527\n566     0.828589      1.817618        0.811329   0.835270        -0.835509          0.183969        0.375105             0.396882      -0.808189  ...       1.320625         0.786129    0.796192         -0.799337           0.626487         0.566826              0.526136       -1.301164                -0.170872\n567     1.624440      2.016299        1.702747   1.551036         1.468642          2.162820        1.994466             1.884414       1.899087  ...       
1.968949         1.810506    1.513198          1.387135           2.284642         2.136932              1.931990        1.744693                 1.850944\n568    -2.699432      1.203224       -2.827766  -2.703256        -3.834325         -1.481409       -1.658319            -1.845392      -0.821560  ...       0.810681        -2.231436   -2.149403         -2.064647          -1.731936        -1.819966             -2.131070        0.103122                -0.820663\n\n[569 rows x 30 columns]\n
                                                                                                                                                                                  "}, {"location": "API/data_cleaning/pruner/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  fitDo nothing.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformApply the outlier strategy on the data.

                                                                                                                                                                                  method fit(X=None, y=None, **fit_params)[source]Do nothing.

                                                                                                                                                                                  Implemented for continuity of the API.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                  Returnsself Estimator instance.

                                                                                                                                                                                  method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                  method inverse_transform(X=None, y=None)[source]Do nothing.

                                                                                                                                                                                  Returns the input unchanged. Implemented for continuity of the API.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  Returnsdataframe Feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Target column. Only returned if provided.

                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                  method transform(X, y=None)[source]Apply the outlier strategy on the data.

                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  Returnsdataframe Transformed feature set.

                                                                                                                                                                                  series Transformed target column. Only returned if provided.

                                                                                                                                                                                  "}, {"location": "API/data_cleaning/scaler/", "title": "Scaler", "text": "

                                                                                                                                                                                  class atom.data_cleaning.Scaler(strategy=\"standard\", include_binary=False, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, **kwargs)[source]Scale the data.

                                                                                                                                                                                  Apply one of sklearn's scalers. Categorical columns are ignored.

                                                                                                                                                                                  This class can be accessed from atom through the scale method. Read more in the user guide.

                                                                                                                                                                                  Parametersstrategy: str, default=\"standard\" Strategy with which to scale the data. Choose from:

                                                                                                                                                                                  • \"standard\": Remove mean and scale to unit variance.
                                                                                                                                                                                  • \"minmax\": Scale features to a given range.
                                                                                                                                                                                  • \"maxabs\": Scale features by their maximum absolute value.
                                                                                                                                                                                  • \"robust\": Scale using statistics that are robust to outliers.

                                                                                                                                                                                  include_binary: bool, default=False Whether to scale binary columns (only 0s and 1s).

                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.

                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic naming.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  **kwargs Additional keyword arguments for the strategy estimator.

                                                                                                                                                                                  Attributes[strategy]_: sklearn transformer Object with which the data is scaled, e.g., scaler.standard for the default strategy.

                                                                                                                                                                                  feature_names_in_: np.ndarray Names of features seen during fit.

                                                                                                                                                                                  n_features_in_: int Number of features seen during fit.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  Balancer Balance the number of samples per class in the target column.

                                                                                                                                                                                  Normalizer Transform the data to follow a Normal/Gaussian distribution.

                                                                                                                                                                                  Scaler Scale the data.

                                                                                                                                                                                  "}, {"location": "API/data_cleaning/scaler/#example", "title": "Example", "text": "atomstand-alone
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630              0.05439         0.1720  ...           107.30       740.4            0.1610            0.42250           0.5030               0.22580          0.2807                  0.10710       0\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690              0.09451         0.1860  ...           142.20      1493.0            0.1492            0.25360           0.3759               0.15100          0.3074                  0.07863       0\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699              0.04744         0.1538  ...           135.10      1320.0            0.1315            0.18060           0.2080               0.11360          0.2504                  0.07948       0\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686              0.02739         0.1852  ...           
110.10       931.4            0.1148            0.09866           0.1547               0.06575          0.3233                  0.06165       0\n4           8.95         15.76           58.74      245.2          0.09462           0.12430         0.09263              0.02308         0.1305  ...            63.34       270.0            0.1179            0.18790           0.1544               0.03846          0.1652                  0.07722       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     ...\n564        14.34         13.47           92.51      641.2          0.09906           0.07624         0.05724              0.04603         0.2075  ...           110.40       873.2            0.1297            0.15250           0.1632               0.10870          0.3062                  0.06072       1\n565        13.17         21.81           85.42      531.5          0.09714           0.10470         0.08259              0.05252         0.1746  ...           105.50       740.7            0.1503            0.39040           0.3728               0.16070          0.3693                  0.09618       0\n566        17.30         17.08          113.00      928.2          0.10080           0.10410         0.12660              0.08353         0.1813  ...           130.90      1222.0            0.1416            0.24050           0.3378               0.18570          0.3138                  0.08113       0\n567        17.68         20.74          117.40      963.7          0.11150           0.16650         0.18550              0.10540         0.1971  ...           
132.90      1302.0            0.1418            0.34980           0.3583               0.15150          0.2463                  0.07738       0\n568        14.80         17.66           95.88      674.8          0.09179           0.08890         0.04069              0.02260         0.1893  ...           105.90       829.5            0.1226            0.18810           0.2060               0.08308          0.3600                  0.07285       1\n\n[569 rows x 31 columns]\n\n\n>>> atom.scale(verbose=2)\n\nFitting Scaler...\nScaling features...\n\n\n>>> # Note the scaled feature values\n>>> print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0      -0.181875      0.356669       -0.147122  -0.270991         0.340268          0.381628        0.214571             0.125567      -0.345050  ...         0.000933   -0.246244          1.240292           1.077359         1.116229              1.667157       -0.162964                 1.326816       0\n1       1.162216      0.300578        1.159704   1.097856         0.707625          0.368288        0.852572             1.148598       0.172744  ...         1.025723    1.042996          0.719898          -0.011475         0.500961              0.537309        0.280594                -0.308640       0\n2       1.056470      1.212060        0.933833   0.950360        -0.581659         -0.670877       -0.407166            -0.051653      -1.018183  ...         0.817241    0.746639         -0.060694          -0.482078        -0.311813             -0.027615       -0.666328                -0.259812       0\n3       0.277287      2.457753        0.188054   0.174273        -0.959614         -1.132432       -0.534892            -0.562913       0.143156  ...         
0.083151    0.080948         -0.797185          -1.010314        -0.569828             -0.750385        0.544735                -1.284055       0\n4      -1.442482     -0.825921       -1.343434  -1.143186        -0.152840          0.358760        0.042209            -0.672815      -1.879941  ...        -1.289891   -1.052061         -0.660471          -0.435018        -0.571280             -1.162598       -2.081728                -0.389638       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     ...\n564     0.057446     -1.361124        0.018651  -0.043220         0.160827         -0.557108       -0.404013            -0.087607       0.967929  ...         0.091960   -0.018751         -0.140077          -0.663228        -0.528681             -0.101629        0.260659                -1.337478       1\n565    -0.268141      0.588045       -0.267318  -0.347933         0.025188         -0.014753       -0.084382             0.077883      -0.248889  ...        -0.051921   -0.245730          0.768409           0.870422         0.485954              0.683827        1.308918                 0.699518       0\n566     0.881154     -0.517419        0.845098   0.753978         0.283751         -0.026187        0.470528             0.868616      -0.001087  ...         0.693914    0.578760          0.384728          -0.095926         0.316526              1.061450        0.386915                -0.165028       0\n567     0.986900      0.337972        1.022568   0.852586         1.039660          1.162956        1.213182             1.426285       0.583281  ...         
0.752641    0.715804          0.393548           0.608690         0.415763              0.544861       -0.734440                -0.380446       0\n568     0.185455     -0.381865        0.154577   0.050111        -0.352767         -0.315850       -0.612688            -0.685055       0.294796  ...        -0.040176   -0.093611         -0.453195          -0.433728        -0.321494             -0.488617        1.154420                -0.640672       1\n\n[569 rows x 31 columns]\n
                                                                                                                                                                                  >>> from atom.data_cleaning import Scaler\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> scaler = Scaler(verbose=2)\n>>> X = scaler.fit_transform(X)\n\nFitting Scaler...\nScaling features...\n\n\n>>> # Note the scaled feature values\n>>> print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst texture  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension\n0       1.097064     -2.073335        1.269934   0.984375         1.568466          3.283515        2.652874             2.532475       2.217515  ...      -1.359293         2.303601    2.001237          1.307686           2.616665         2.109526              2.296076        2.750622                 1.937015\n1       1.829821     -0.353632        1.685955   1.908708        -0.826962         -0.487072       -0.023846             0.548144       0.001392  ...      -0.369203         1.535126    1.890489         -0.375612          -0.430444        -0.146749              1.087084       -0.243890                 0.281190\n2       1.579888      0.456187        1.566503   1.558884         0.942210          1.052926        1.363478             2.037231       0.939685  ...      -0.023974         1.347475    1.456285          0.527407           1.082932         0.854974              1.955000        1.152255                 0.201391\n3      -0.768909      0.253732       -0.592687  -0.764464         3.283553          3.402909        1.915897             1.451707       2.867383  ...       
0.133984        -0.249939   -0.550021          3.394275           3.893397         1.989588              2.175786        6.046041                 4.935010\n4       1.750297     -1.151816        1.776573   1.826229         0.280372          0.539340        1.371011             1.428493      -0.009560  ...      -1.466770         1.338539    1.220724          0.220556          -0.313395         0.613179              0.729259       -0.868353                -0.397100\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...            ...              ...         ...               ...                ...              ...                   ...             ...                      ...\n564     2.110995      0.721473        2.060786   2.343856         1.041842          0.219060        1.947285             2.320965      -0.312589  ...       0.117700         1.752563    2.015301          0.378365          -0.273318         0.664512              1.629151       -1.360158                -0.709091\n565     1.704854      2.085134        1.615931   1.723842         0.102458         -0.017833        0.693043             1.263669      -0.217664  ...       2.047399         1.421940    1.494959         -0.691230          -0.394820         0.236573              0.733827       -0.531855                -0.973978\n566     0.702284      2.045574        0.672676   0.577953        -0.840484         -0.038680        0.046588             0.105777      -0.809117  ...       1.374854         0.579001    0.427906         -0.809587           0.350735         0.326767              0.414069       -1.104549                -0.318409\n567     1.838341      2.336457        1.982524   1.735218         1.525767          3.272144        3.296944             2.658866       2.137194  ...       
2.237926         2.303601    1.653171          1.430427           3.904848         3.197605              2.289985        1.919083                 2.219635\n568    -1.808401      1.221792       -1.814389  -1.347789        -3.112085         -1.150752       -1.114873            -1.261820      -0.820070  ...       0.764190        -1.432735   -1.075813         -1.859019          -1.207552        -1.305831             -1.745063       -0.048138                -0.751207\n\n[569 rows x 30 columns]\n
                                                                                                                                                                                  "}, {"location": "API/data_cleaning/scaler/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformApply the inverse transformation to the data.set_paramsSet the parameters of this estimator.transformPerform standardization by centering and scaling.

                                                                                                                                                                                  method fit(X, y=None)[source]Fit to data.

                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  ReturnsSelf Estimator instance.

                                                                                                                                                                                  method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                  Returns dataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                  method get_params(deep=True)[source] Get parameters for this estimator.

                                                                                                                                                                                  Parameters deep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                  Returns params : dict Parameter names mapped to their values.

                                                                                                                                                                                  method inverse_transform(X, y=None)[source] Apply the inverse transformation to the data.

                                                                                                                                                                                  Parameters X: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  Returns dataframe Dataframe with the scaling reverted (original feature scale).

                                                                                                                                                                                  method set_params(**params)[source] Set the parameters of this estimator.

                                                                                                                                                                                  Parameters **params : dict Estimator parameters.

                                                                                                                                                                                  Returns self : estimator instance Estimator instance.

                                                                                                                                                                                  method transform(X, y=None)[source] Perform standardization by centering and scaling.

                                                                                                                                                                                  Parameters X: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  Returns dataframe Scaled dataframe.

                                                                                                                                                                                  "}, {"location": "API/feature_engineering/featureextractor/", "title": "FeatureExtractor", "text": "

                                                                                                                                                                                  class atom.feature_engineering.FeatureExtractor(features=('day', 'month', 'year'), fmt=None, encoding_type=\"ordinal\", drop_columns=True, verbose=0, logger=None)[source]Extract features from datetime columns.

                                                                                                                                                                                  Create new features by extracting datetime elements (day, month, year, etc.) from the provided columns. Columns of dtype datetime64 are used as is. Categorical columns that can be successfully converted to a datetime format (less than 30% NaT values after conversion) are also used.

                                                                                                                                                                                  This class can be accessed from atom through the feature_extraction method. Read more in the user guide.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  Decision trees based algorithms build their split rules according to one feature at a time. This means that they will fail to correctly process cyclic features since the sin/cos features should be considered one single coordinate system.

                                                                                                                                                                                  Parameters features: str or sequence, default=(\"day\", \"month\", \"year\") Features to create from the datetime columns. Note that created features with zero variance (e.g., the feature hour in a column that only contains dates) are ignored. Allowed values are datetime attributes from pandas.Series.dt.

                                                                                                                                                                                  fmt: str, sequence or None, default=None Format (strptime) of the categorical columns that need to be converted to datetime. If sequence, the n-th format corresponds to the n-th categorical column that can be successfully converted. If None, the format is inferred automatically from the first non-NaN value. Values that cannot be converted are returned as NaT.

                                                                                                                                                                                  encoding_type: str, default=\"ordinal\" Type of encoding to use. Choose from:

                                                                                                                                                                                  • \"ordinal\": Encode features in increasing order.
                                                                                                                                                                                  • \"cyclic\": Encode features using sine and cosine to capture their cyclic nature. This approach creates two columns for every feature. Non-cyclic features still use ordinal encoding.

                                                                                                                                                                                  drop_columns: bool, default=True Whether to drop the original columns after transformation.

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic naming.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  Attributes feature_names_in_: np.ndarray Names of features seen during fit.

                                                                                                                                                                                  n_features_in_: int Number of features seen during fit.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  FeatureGenerator Generate new features.

                                                                                                                                                                                  FeatureGrouper Extract statistics from similar features.

                                                                                                                                                                                  FeatureSelector Reduce the number of features in the data.

                                                                                                                                                                                  "}, {"location": "API/feature_engineering/featureextractor/#example", "title": "Example", "text": "atom / stand-alone
                                                                                                                                                                                  >>> import pandas as pd\n>>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> # Add a datetime column\n>>> X[\"date\"] = pd.date_range(start=\"1/1/2018\", periods=len(X))\n\n>>> atom = ATOMClassifier(X, y)\n>>> atom.feature_extraction(features=[\"day\"], fmt=\"%d/%m/%Y\", verbose=2)\n\nFitting FeatureExtractor...\nExtracting datetime features...\n --> Extracting features from column date.\n   --> Creating feature date_day.\n\n\n>>> # Note the date_day column\n>>> print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  date_day  target\n0         12.770         22.47           81.72      506.3          0.09055           0.05761         0.04711              0.02704         0.1585  ...       653.6            0.1419             0.1523           0.2177               0.09331          0.2829                  0.08067        16       0\n1         27.420         26.27          186.90     2501.0          0.10840           0.19880         0.36350              0.16890         0.2061  ...      4254.0            0.1357             0.4256           0.6833               0.26250          0.2641                  0.07427         7       0\n2         15.850         23.95          103.70      782.7          0.08401           0.10020         0.09938              0.05364         0.1847  ...       
876.5            0.1131             0.1924           0.2322               0.11190          0.2809                  0.06287        14       0\n3         14.190         23.81           92.87      610.7          0.09463           0.13060         0.11150              0.06462         0.2235  ...       811.3            0.1559             0.4059           0.3744               0.17720          0.4724                  0.10260         3       0\n4          8.950         15.76           58.74      245.2          0.09462           0.12430         0.09263              0.02308         0.1305  ...       270.0            0.1179             0.1879           0.1544               0.03846          0.1652                  0.07722        27       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...         ...               ...                ...              ...                   ...             ...                      ...       ...     ...\n564       10.800         21.98           68.79      359.9          0.08801           0.05743         0.03614              0.01404         0.2016  ...       489.5            0.1303             0.1696           0.1927               0.07485          0.2965                  0.07662         4       1\n565       11.930         10.91           76.14      442.7          0.08872           0.05242         0.02606              0.01796         0.1601  ...       589.5            0.1374             0.1575           0.1514               0.06876          0.2460                  0.07262         6       1\n566       24.630         21.60          165.50     1841.0          0.10300           0.21060         0.23100              0.14710         0.1991  ...      
2642.0            0.1342             0.4188           0.4658               0.24750          0.3157                  0.09671         6       0\n567        6.981         13.43           43.79      143.5          0.11700           0.07568         0.00000              0.00000         0.1930  ...       185.2            0.1584             0.1202           0.0000               0.00000          0.2932                  0.09382        12       1\n568       15.050         19.07           97.26      701.9          0.09215           0.08597         0.07486              0.04335         0.1561  ...       967.0            0.1246             0.2101           0.2866               0.11200          0.2282                  0.06954        30       0\n\n[569 rows x 32 columns]\n
                                                                                                                                                                                  >>> import pandas as pd\n>>> from atom.feature_engineering import FeatureExtractor\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, _ = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> # Add a datetime column\n>>> X[\"date\"] = pd.date_range(start=\"1/1/2018\", periods=len(X))\n\n>>> fe = FeatureExtractor(features=[\"day\"], fmt=\"%Y-%m-%d\", verbose=2)\n>>> X = fe.transform(X)\n\nExtracting datetime features...\n --> Extracting features from column date.\n   --> Creating feature date_day.\n\n\n>>> # Note the date_day column\n>>> print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  date_day\n0          17.99         10.38          122.80     1001.0          0.11840           0.27760         0.30010              0.14710         0.2419  ...           184.60      2019.0           0.16220            0.66560           0.7119                0.2654          0.4601                  0.11890         1\n1          20.57         17.77          132.90     1326.0          0.08474           0.07864         0.08690              0.07017         0.1812  ...           158.80      1956.0           0.12380            0.18660           0.2416                0.1860          0.2750                  0.08902         2\n2          19.69         21.25          130.00     1203.0          0.10960           0.15990         0.19740              0.12790         0.2069  ...           
152.50      1709.0           0.14440            0.42450           0.4504                0.2430          0.3613                  0.08758         3\n3          11.42         20.38           77.58      386.1          0.14250           0.28390         0.24140              0.10520         0.2597  ...            98.87       567.7           0.20980            0.86630           0.6869                0.2575          0.6638                  0.17300         4\n4          20.29         14.34          135.10     1297.0          0.10030           0.13280         0.19800              0.10430         0.1809  ...           152.20      1575.0           0.13740            0.20500           0.4000                0.1625          0.2364                  0.07678         5\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...       ...\n564        21.56         22.39          142.00     1479.0          0.11100           0.11590         0.24390              0.13890         0.1726  ...           166.10      2027.0           0.14100            0.21130           0.4107                0.2216          0.2060                  0.07115        19\n565        20.13         28.25          131.20     1261.0          0.09780           0.10340         0.14400              0.09791         0.1752  ...           155.00      1731.0           0.11660            0.19220           0.3215                0.1628          0.2572                  0.06637        20\n566        16.60         28.08          108.30      858.1          0.08455           0.10230         0.09251              0.05302         0.1590  ...           
126.70      1124.0           0.11390            0.30940           0.3403                0.1418          0.2218                  0.07820        21\n567        20.60         29.33          140.10     1265.0          0.11780           0.27700         0.35140              0.15200         0.2397  ...           184.60      1821.0           0.16500            0.86810           0.9387                0.2650          0.4087                  0.12400        22\n568         7.76         24.54           47.92      181.0          0.05263           0.04362         0.00000              0.00000         0.1587  ...            59.16       268.6           0.08996            0.06444           0.0000                0.0000          0.2871                  0.07039        23\n\n[569 rows x 31 columns]\n
                                                                                                                                                                                  "}, {"location": "API/feature_engineering/featureextractor/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  fitDo nothing.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformExtract the new features.

                                                                                                                                                                                  method fit(X=None, y=None, **fit_params)[source] Do nothing.

                                                                                                                                                                                  Implemented for continuity of the API.

                                                                                                                                                                                  Parameters X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                  Returns self Estimator instance.

                                                                                                                                                                                  method fit_transform(X=None, y=None, **fit_params)[source] Fit to data, then transform it.

                                                                                                                                                                                  Parameters X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                  Returns dataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                  method get_params(deep=True)[source] Get parameters for this estimator.

                                                                                                                                                                                  Parameters deep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                  Returns params : dict Parameter names mapped to their values.

                                                                                                                                                                                  method inverse_transform(X=None, y=None)[source] Do nothing.

                                                                                                                                                                                  Returns the input unchanged. Implemented for continuity of the API.

                                                                                                                                                                                  Parameters X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  Returns dataframe Feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Target column. Only returned if provided.

                                                                                                                                                                                  method set_params(**params)[source] Set the parameters of this estimator.

                                                                                                                                                                                  Parameters **params : dict Estimator parameters.

                                                                                                                                                                                  Returns self : estimator instance Estimator instance.

                                                                                                                                                                                  method transform(X, y=None)[source] Extract the new features.

                                                                                                                                                                                  Parameters X: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  Returnsdataframe Transformed feature set.

                                                                                                                                                                                  "}, {"location": "API/feature_engineering/featuregenerator/", "title": "FeatureGenerator", "text": "

                                                                                                                                                                                  class atom.feature_engineering.FeatureGenerator(strategy=\"dfs\", n_features=None, operators=None, n_jobs=1, verbose=0, logger=None, random_state=None, **kwargs)[source]Generate new features.

                                                                                                                                                                                  Create new combinations of existing features to capture the non-linear relations between the original features.

                                                                                                                                                                                  This class can be accessed from atom through the feature_generation method. Read more in the user guide.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  • Using the div, log or sqrt operators can return new features with inf or NaN values. Check the warnings that may pop up or use atom's nans attribute.
                                                                                                                                                                                  • When using dfs with n_jobs>1, make sure to protect your code with if __name__ == \"__main__\". Featuretools uses dask, which uses python multiprocessing for parallelization. The spawn method on multiprocessing starts a new python process, which requires it to import the __main__ module before it can do its task.
                                                                                                                                                                                  • gfg can be slow for very large populations.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  dfs can create many new features and not all of them will be useful. Use the FeatureSelector class to reduce the number of features.

                                                                                                                                                                                          Parametersstrategy: str, default=\"dfs\" Strategy to create new features. Choose from:

                                                                                                                                                                                  • \"dfs\": Deep Feature Synthesis.
                                                                                                                                                                                  • \"gfg\": Genetic Feature Generation.

                                                                                                                                                                                  n_features: int or None, default=None Maximum number of newly generated features to add to the dataset. If None, select all created features.

                                                                                                                                                                                  operators: str, sequence or None, default=None Mathematical operators to apply on the features. None to use all. Choose from: add, sub, mul, div, abs, sqrt, log, inv, sin, cos, tan.

                                                                                                                                                                                  n_jobs: int, default=1 Number of cores to use for parallel processing.

                                                                                                                                                                                  • If >0: Number of cores to use.
                                                                                                                                                                                  • If -1: Use all available cores.
                                                                                                                                                                                          • If <-1: Use number of cores + 1 + n_jobs.

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic naming.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

                                                                                                                                                                                  **kwargs Additional keyword arguments for the SymbolicTransformer instance. Only for the gfg strategy.

                                                                                                                                                                                  Attributesgfg_: SymbolicTransformer Object used to calculate the genetic features. Only available when strategy=\"gfg\".

                                                                                                                                                                                  genetic_features_: pd.DataFrame Information on the newly created non-linear features. Only available when strategy=\"gfg\". Columns include:

                                                                                                                                                                                  • name: Name of the feature (generated automatically).
                                                                                                                                                                                  • description: Operators used to create this feature.
                                                                                                                                                                                  • fitness: Fitness score.

                                                                                                                                                                                  feature_names_in_: np.ndarray Names of features seen during fit.

                                                                                                                                                                                  n_features_in_: int Number of features seen during fit.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  FeatureExtractor Extract features from datetime columns.

                                                                                                                                                                                  FeatureGrouper Extract statistics from similar features.

                                                                                                                                                                                  FeatureSelector Reduce the number of features in the data.

                                                                                                                                                                                  "}, {"location": "API/feature_engineering/featuregenerator/#example", "title": "Example", "text": "atomstand-alone
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y)\n>>> atom.feature_generation(strategy=\"dfs\", n_features=5, verbose=2)\n\nFitting FeatureGenerator...\nGenerating new features...\n --> 5 new features were added.\n\n\n>>> # Note the texture error / worst symmetry column\n>>> print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  ...  mean concave points * smoothness error  mean concavity + worst radius  mean radius / smoothness error  worst concave points * worst radius  worst radius / concave points error  target\n0         13.280         13.72           85.79      541.8          0.08363           0.08575  ...                                0.000122                       14.29077                     3109.342074                             1.306235                          1681.624941       1\n1         15.460         11.89          102.50      736.9          0.12570           0.15550  ...                                0.000592                       18.99320                     2866.679028                             3.432933                          1423.484848       0\n2         13.110         15.56           87.21      530.2          0.13980           0.17650  ...                                0.000688                       16.51710                     1830.494275                             3.239166                          1175.072046       0\n3          9.847         15.68           63.00      293.2          0.09492           0.08419  ...                                
0.000211                       11.26330                     1127.691251                             0.733747                          1652.698133       1\n4         14.870         20.21           96.12      680.9          0.09587           0.08345  ...                                0.000268                       16.07824                     2746.075716                             1.628217                          1353.338969       1\n..           ...           ...             ...        ...              ...               ...  ...                                     ...                            ...                             ...                                  ...                                  ...     ...\n564       14.470         24.99           95.81      656.4          0.08837           0.12300  ...                                0.000278                       16.32090                     2027.178481                             1.954510                          1395.869191       1\n565       19.690         21.25          130.00     1203.0          0.10960           0.15990  ...                                0.000787                       23.76740                     3201.626016                             5.727510                          1145.286686       0\n566       19.270         26.47          127.90     1162.0          0.09401           0.17190  ...                                0.000381                       24.31570                     3842.472582                             4.310775                          2504.407342       0\n567       11.760         18.14           75.00      431.1          0.09968           0.05914  ...                                0.000197                       13.38685                     2101.501072                             0.956576                           932.960894       0\n568       14.580         13.66           94.29      658.8          0.09832           0.08918  ...                                
0.000215                       16.84222                     2943.670503                             1.539574                          1938.020352       1\n\n[569 rows x 36 columns]\n
                                                                                                                                                                                  >>> from atom.feature_engineering import FeatureGenerator\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> fg = FeatureGenerator(strategy=\"dfs\", n_features=5, verbose=2)\n>>> X = fg.fit_transform(X, y)\n\nFitting FeatureGenerator...\nGenerating new features...\n --> 5 new features were added.\n\n\n>>> # Note the radius error * worst smoothness column\n>>> print(X)\n\n       mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  ...  worst fractal dimension  mean area - perimeter error  mean texture * worst fractal dimension  symmetry error / concave points error  texture error * worst area  worst radius / compactness error\nindex                                                                                           ...                                                                                                                                                                                                   \n0            17.99         10.38          122.80     1001.0          0.11840           0.27760  ...                  0.11890                      992.411                                1.234182                               1.892250                   1827.8007                        517.536705\n1            20.57         17.77          132.90     1326.0          0.08474           0.07864  ...                  0.08902                     1322.602                                1.581885                               1.036567                   1435.5084                       1910.550459\n2            19.69         21.25          130.00     1203.0          0.10960           0.15990  ...                  
0.08758                     1198.415                                1.861075                               1.093294                   1344.8121                        588.367449\n3            11.42         20.38           77.58      386.1          0.14250           0.28390  ...                  0.17300                      382.655                                3.525740                               3.193894                    656.2612                        199.919549\n4            20.29         14.34          135.10     1297.0          0.10030           0.13280  ...                  0.07678                     1291.562                                1.101025                               0.931565                   1230.5475                        915.887850\n...            ...           ...             ...        ...              ...               ...  ...                      ...                          ...                                     ...                                    ...                         ...                               ...\n564          21.56         22.39          142.00     1479.0          0.11100           0.11590  ...                  0.07115                     1471.327                                1.593049                               0.453953                   2545.9120                        880.318229\n565          20.13         28.25          131.20     1261.0          0.09780           0.10340  ...                  0.06637                     1255.797                                1.874953                               1.131108                   4263.4530                        977.713578\n566          16.60         28.08          108.30      858.1          0.08455           0.10230  ...                  
0.07820                      854.675                                2.195856                               0.846500                   1208.3000                        508.710801\n567          20.60         29.33          140.10     1265.0          0.11780           0.27700  ...                  0.12400                     1259.228                                3.636920                               1.396635                   2904.4950                        417.992855\n568           7.76         24.54           47.92      181.0          0.05263           0.04362  ...                  0.07039                      178.452                                1.727371                                    inf                    383.5608                       2029.184549\n\n[569 rows x 35 columns]\n
                                                                                                                                                                                  "}, {"location": "API/feature_engineering/featuregenerator/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformGenerate new features.

                                                                                                                                                                                  method fit(X, y=None)[source]Fit to data.

                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  Returnsself Estimator instance.

                                                                                                                                                                                  method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                  method inverse_transform(X=None, y=None)[source]Do nothing.

                                                                                                                                                                                  Returns the input unchanged. Implemented for continuity of the API.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  Returnsdataframe Feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Target column. Only returned if provided.

                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                  method transform(X, y=None)[source]Generate new features.

                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  Returnsdataframe Transformed feature set.

                                                                                                                                                                                  "}, {"location": "API/feature_engineering/featuregrouper/", "title": "FeatureGrouper", "text": "

                                                                                                                                                                                  class atom.feature_engineering.FeatureGrouper(groups, operators=None, drop_columns=True, verbose=0, logger=None)[source]Extract statistics from similar features.

                                                                                                                                                                                  Replace groups of features with related characteristics with new features that summarize statistical properties of the group. The statistical operators are calculated over every row of the group. The group names and features can be accessed through the groups method.

                                                                                                                                                                                  This class can be accessed from atom through the feature_grouping method. Read more in the user guide.

                                                                                                                                                                                  Parameters: groups: dict Group names and features. A feature can belong to multiple groups.

                                                                                                                                                                                  operators: str, sequence or None, default=None Statistical operators to apply on the groups. Any operator from numpy or scipy.stats (checked in that order) that is applied on an array can be used. If None, it uses: min, max, mean, median, mode and std.

                                                                                                                                                                                  drop_columns: bool, default=True Whether to drop the columns in groups after transformation.

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic naming.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  Attributes: feature_names_in_: np.ndarray Names of features seen during fit.

                                                                                                                                                                                  n_features_in_: int Number of features seen during fit.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  FeatureExtractor Extract features from datetime columns.

                                                                                                                                                                                  FeatureGenerator Generate new features.

                                                                                                                                                                                  FeatureSelector Reduce the number of features in the data.

                                                                                                                                                                                  "}, {"location": "API/feature_engineering/featuregrouper/#example", "title": "Example", "text": "atomstand-alone
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y)\n>>> atom.feature_grouping({\"group1\": \"mean.*\"}, verbose=2)\n\nFitting FeatureGrouper...\nGrouping features...\n --> Group group1 successfully created.\n\n\n>>> print(atom.dataset)\n\n     radius error  texture error  perimeter error  area error  smoothness error  compactness error  concavity error  concave points error  symmetry error  ...  worst symmetry  worst fractal dimension  min(group1)  max(group1)  mean(group1)  median(group1)  mode(group1)  std(group1)  target\n0          0.5190         2.9100            5.801       67.10          0.007545           0.060500         0.021340              0.018430         0.03056  ...          0.2311                  0.09203      0.07224       1132.0    130.736684        0.186400       0.07224   335.890773       0\n1          0.4564         1.0750            3.425       48.55          0.005903           0.037310         0.047300              0.015570         0.01318  ...          0.2218                  0.07820      0.05302        858.1    101.162786        0.130650       0.05302   254.320568       0\n2          0.2298         0.9988            1.534       22.18          0.002826           0.009105         0.013110              0.005174         0.01013  ...          0.2683                  0.06829      0.02847        758.6     89.400425        0.116550       0.02847   224.981976       0\n3          0.3117         0.8155            1.972       27.94          0.005217           0.015150         0.016780              0.012680         0.01669  ...          
0.2723                  0.07071      0.05723        761.7     89.389875        0.138110       0.09462   226.081026       1\n4          0.3336         1.8600            2.041       19.91          0.011880           0.037470         0.045910              0.015440         0.02287  ...          0.2383                  0.09026      0.03068        334.2     43.414796        0.161250       0.03068    99.030712       1\n..            ...            ...              ...         ...               ...                ...              ...                   ...             ...  ...             ...                      ...          ...          ...           ...             ...           ...          ...     ...\n564        0.4727         1.2400            3.195       45.40          0.005718           0.011620         0.019980              0.011090         0.01410  ...          0.3029                  0.08216      0.05259        684.5     81.456503        0.128635       0.05259   202.924880       0\n565        0.8601         1.4800            7.029      111.70          0.008124           0.036110         0.054890              0.027650         0.03176  ...          0.2909                  0.05865      0.05024       1290.0    146.813205        0.170250       0.05024   383.094862       0\n566        0.2094         0.7636            1.231       17.67          0.008725           0.020030         0.023350              0.011320         0.02625  ...          0.3380                  0.09584      0.03370        513.7     62.632288        0.136750       0.03370   152.314252       1\n567        0.2818         0.7614            1.808       18.54          0.006142           0.006134         0.001835              0.003576         0.01637  ...          
0.2738                  0.07685      0.00309        366.8     45.967364        0.109675       0.00309   108.819747       1\n568        0.2810         0.8135            3.369       23.81          0.004929           0.066570         0.076830              0.013680         0.01526  ...          0.2845                  0.12490      0.02833        542.9     66.369889        0.141200       0.02833   160.878141       1\n\n[569 rows x 27 columns]\n
                                                                                                                                                                                  >>> from atom.feature_engineering import FeatureGrouper\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, _ = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> fg = FeatureGrouper({\"group1\": [\"mean texture\", \"mean radius\"]}, verbose=2)\n>>> X = fg.transform(X)\n\nGrouping features...\n --> Group group1 successfully created.\n\n\n>>> print(X)\n\n     mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  mean fractal dimension  radius error  ...  worst concave points  worst symmetry  worst fractal dimension  min(group1)  max(group1)  mean(group1)  median(group1)  mode(group1)  std(group1)\n0            122.80     1001.0          0.11840           0.27760         0.30010              0.14710         0.2419                 0.07871        1.0950  ...                0.2654          0.4601                  0.11890        10.38        17.99        14.185          14.185         10.38        3.805\n1            132.90     1326.0          0.08474           0.07864         0.08690              0.07017         0.1812                 0.05667        0.5435  ...                0.1860          0.2750                  0.08902        17.77        20.57        19.170          19.170         17.77        1.400\n2            130.00     1203.0          0.10960           0.15990         0.19740              0.12790         0.2069                 0.05999        0.7456  ...                0.2430          0.3613                  0.08758        19.69        21.25        20.470          20.470         19.69        0.780\n3             77.58      386.1          0.14250           0.28390         0.24140              0.10520         0.2597                 0.09744        0.4956  ...                
0.2575          0.6638                  0.17300        11.42        20.38        15.900          15.900         11.42        4.480\n4            135.10     1297.0          0.10030           0.13280         0.19800              0.10430         0.1809                 0.05883        0.7572  ...                0.1625          0.2364                  0.07678        14.34        20.29        17.315          17.315         14.34        2.975\n..              ...        ...              ...               ...             ...                  ...            ...                     ...           ...  ...                   ...             ...                      ...          ...          ...           ...             ...           ...          ...\n564          142.00     1479.0          0.11100           0.11590         0.24390              0.13890         0.1726                 0.05623        1.1760  ...                0.2216          0.2060                  0.07115        21.56        22.39        21.975          21.975         21.56        0.415\n565          131.20     1261.0          0.09780           0.10340         0.14400              0.09791         0.1752                 0.05533        0.7655  ...                0.1628          0.2572                  0.06637        20.13        28.25        24.190          24.190         20.13        4.060\n566          108.30      858.1          0.08455           0.10230         0.09251              0.05302         0.1590                 0.05648        0.4564  ...                0.1418          0.2218                  0.07820        16.60        28.08        22.340          22.340         16.60        5.740\n567          140.10     1265.0          0.11780           0.27700         0.35140              0.15200         0.2397                 0.07016        0.7260  ...                
0.2650          0.4087                  0.12400        20.60        29.33        24.965          24.965         20.60        4.365\n568           47.92      181.0          0.05263           0.04362         0.00000              0.00000         0.1587                 0.05884        0.3857  ...                0.0000          0.2871                  0.07039         7.76        24.54        16.150          16.150          7.76        8.390\n\n[569 rows x 34 columns]\n
                                                                                                                                                                                  "}, {"location": "API/feature_engineering/featuregrouper/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  fitDo nothing.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformGroup features.

                                                                                                                                                                                  method fit(X=None, y=None, **fit_params)[source] Do nothing.

                                                                                                                                                                                  Implemented for continuity of the API.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                  Returns: self Estimator instance.

                                                                                                                                                                                  method fit_transform(X=None, y=None, **fit_params)[source] Fit to data, then transform it.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                  Returns: dataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                  method get_params(deep=True)[source] Get parameters for this estimator.

                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                  Returns: params : dict Parameter names mapped to their values.

                                                                                                                                                                                  method inverse_transform(X=None, y=None)[source] Do nothing.

                                                                                                                                                                                  Returns the input unchanged. Implemented for continuity of the API.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  Returns: dataframe Feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Target column. Only returned if provided.

                                                                                                                                                                                  method set_params(**params)[source] Set the parameters of this estimator.

                                                                                                                                                                                  Parameters: **params : dict Estimator parameters.

                                                                                                                                                                                  Returns: self : estimator instance Estimator instance.

                                                                                                                                                                                  method transform(X, y=None)[source] Group features.

                                                                                                                                                                                  Parameters: X: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  Returns: dataframe Transformed feature set.

                                                                                                                                                                                  "}, {"location": "API/feature_engineering/featureselector/", "title": "FeatureSelector", "text": "

                                                                                                                                                                                  class atom.feature_engineering.FeatureSelector(strategy=None, solver=None, n_features=None, min_repeated=2, max_repeated=1.0, max_correlation=1.0, n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", verbose=0, logger=None, random_state=None, **kwargs)[source] Reduce the number of features in the data.

                                                                                                                                                                                  Apply feature selection or dimensionality reduction, either to improve the estimators' accuracy or to boost their performance on very high-dimensional datasets. Additionally, remove multicollinear and low-variance features.

                                                                                                                                                                                  This class can be accessed from atom through the feature_selection method. Read more in the user guide.

                                                                                                                                                                                  Warning

                                                                                                                                                                                  • Ties between features with equal scores are broken in an unspecified way.
                                                                                                                                                                                  • For strategy=\"rfecv\", the n_features parameter is the minimum number of features to select, not the actual number of features that the transformer returns. It may very well be that it returns more!

                                                                                                                                                                                  Info

                                                                                                                                                                                  • The \"sklearnex\" and \"cuml\" engines are only supported for strategy=\"pca\" with dense datasets.
                                                                                                                                                                                  • If strategy=\"pca\" and the data is dense and unscaled, it's scaled to mean=0 and std=1 before fitting the PCA transformer.
                                                                                                                                                                                  • If strategy=\"pca\" and the provided data is sparse, the used estimator is TruncatedSVD, which works more efficiently with sparse matrices.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  • Use the plot_pca and plot_components methods to examine the results after using strategy=\"pca\".
                                                                                                                                                                                  • Use the plot_rfecv method to examine the results after using strategy=\"rfecv\".
                                                                                                                                                                                  • Use the plot_feature_importance method to examine how much a specific feature contributes to the final predictions. If the model doesn't have a feature_importances_ attribute, use plot_permutation_importance instead.

                                                                                                                                                                                  Parameters: strategy: str or None, default=None Feature selection strategy to use. Choose from:

                                                                                                                                                                                  • None: Do not perform any feature selection strategy.
                                                                                                                                                                                  • \"univariate\": Univariate statistical F-test.
                                                                                                                                                                                  • \"pca\": Principal Component Analysis.
                                                                                                                                                                                  • \"sfm\": Select best features according to a model.
                                                                                                                                                                                  • \"sfs\": Sequential Feature Selection.
                                                                                                                                                                                  • \"rfe\": Recursive Feature Elimination.
                                                                                                                                                                                  • \"rfecv\": RFE with cross-validated selection.
                                                                                                                                                                                  • \"pso\": Particle Swarm Optimization.
                                                                                                                                                                                  • \"hho\": Harris Hawks Optimization.
                                                                                                                                                                                  • \"gwo\": Grey Wolf Optimization.
                                                                                                                                                                                  • \"dfo\": Dragonfly Optimization.
                                                                                                                                                                                  • \"go\": Genetic Optimization.

                                                                                                                                                                                  solver: str, func, estimator or None, default=None Solver/estimator to use for the feature selection strategy. See the corresponding documentation for an extended description of the choices. If None, the default value is used (only if strategy=\"pca\"). Choose from:

                                                                                                                                                                                  • If strategy=\"univariate\":

                                                                                                                                                                                    • \"f_classif\"
                                                                                                                                                                                    • \"f_regression\"
                                                                                                                                                                                    • \"mutual_info_classif\"
                                                                                                                                                                                    • \"mutual_info_regression\"
                                                                                                                                                                                    • \"chi2\"
                                                                                                                                                                                    • Any function with signature func(X, y) -> tuple[scores, p-values].
                                                                                                                                                                                  • If strategy=\"pca\":

                                                                                                                                                                                    • If data is dense:

                                                                                                                                                                                      • If engine=\"sklearn\":

                                                                                                                                                                                        • \"auto\" (default)
                                                                                                                                                                                        • \"full\"
                                                                                                                                                                                        • \"arpack\"
                                                                                                                                                                                        • \"randomized\"
                                                                                                                                                                                      • If engine=\"sklearnex\":

                                                                                                                                                                                        • \"full\" (default)
                                                                                                                                                                                      • If engine=\"cuml\":

                                                                                                                                                                                        • \"full\" (default)
                                                                                                                                                                                        • \"jacobi\"
                                                                                                                                                                                    • If data is sparse:

                                                                                                                                                                                      • \"randomized\" (default)
                                                                                                                                                                                      • \"arpack\"
                                                                                                                                                                                  • For the remaining strategies: The base estimator. For sfm, rfe and rfecv, it should have either a feature_importances_ or coef_ attribute after fitting. You can use one of the predefined models. Add _class or _reg after the model's name to specify a classification or regression task, e.g., solver=\"LGB_reg\" (not necessary if called from atom). No default option.

                                                                                                                                                                                  n_features: int, float or None, default=None Number of features to select.

                                                                                                                                                                                  • If None: Select all features.
                                                                                                                                                                                  • If <1: Fraction of the total features to select.
                                                                                                                                                                                  • If >=1: Number of features to select.

                                                                                                                                                                                  If strategy=\"sfm\" and the threshold parameter is not specified, the threshold is automatically set to -inf to select n_features number of features.

                                                                                                                                                                                  If strategy=\"rfecv\", n_features is the minimum number of features to select.

                                                                                                                                                                                  This parameter is ignored if any of the following strategies is selected: pso, hho, gwo, dfo, go.

                                                                                                                                                                                  min_repeated: int, float or None, default=2 Remove categorical features if there isn't any repeated value in at least min_repeated rows. The default is to keep all features with non-maximum variance, i.e., remove the features whose number of unique values is equal to the number of rows (usually the case for names, IDs, etc...).

                                                                                                                                                                                  • If None: No check for minimum repetition.
                                                                                                                                                                                  • If >1: Minimum repetition number.
                                                                                                                                                                                  • If <=1: Minimum repetition fraction.

                                                                                                                                                                                  max_repeated: int, float or None, default=1.0 Remove categorical features with the same value in at least max_repeated rows. The default is to keep all features with non-zero variance, i.e., remove the features that have the same value in all samples.

                                                                                                                                                                                  • If None: No check for maximum repetition.
                                                                                                                                                                                  • If >1: Maximum number of repeated occurrences.
                                                                                                                                                                                  • If <=1: Maximum fraction of repeated occurrences.

                                                                                                                                                                                  max_correlation: float or None, default=1.0 Minimum absolute Pearson correlation to identify correlated features. For each group, it removes all except the feature with the highest correlation to y (if provided, else it removes all but the first). The default value removes equal columns. If None, skip this step.

                                                                                                                                                                                  n_jobs: int, default=1 Number of cores to use for parallel processing.

                                                                                                                                                                                  • If >0: Number of cores to use.
                                                                                                                                                                                  • If -1: Use all available cores.
                                                                                                                                                                                  • If <-1: Use number of cores + 1 + n_jobs.

                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                    • \"sklearnex\"
                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                  backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

                                                                                                                                                                                  • \"loky\": Single-node, process-based parallelism.
                                                                                                                                                                                  • \"multiprocessing\": Legacy single-node, process-based parallelism. Less robust than loky.
                                                                                                                                                                                  • \"threading\": Single-node, thread-based parallelism.
                                                                                                                                                                                  • \"ray\": Multi-node, process-based parallelism.

                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic naming.
                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                  random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

                                                                                                                                                                                  **kwargs Any extra keyword argument for the strategy estimator. See the corresponding documentation for the available options.

                                                                                                                                                                                  Attributescollinear_: pd.DataFrame Information on the removed collinear features. Columns include:

                                                                                                                                                                                  • drop: Name of the dropped feature.
                                                                                                                                                                                  • corr_feature: Names of the correlated features.
                                                                                                                                                                                  • corr_value: Corresponding correlation coefficients.

                                                                                                                                                                                  [strategy]_: sklearn transformer Object used to transform the data, e.g., fs.pca_ for the pca strategy.

                                                                                                                                                                                  feature_names_in_: np.ndarray Names of features seen during fit.

                                                                                                                                                                                  n_features_in_: int Number of features seen during fit.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  FeatureExtractor Extract features from datetime columns.

                                                                                                                                                                                  FeatureGenerator Generate new features.

                                                                                                                                                                                  FeatureGrouper Extract statistics from similar features.

                                                                                                                                                                                  "}, {"location": "API/feature_engineering/featureselector/#example", "title": "Example", "text": "atomstand-alone
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y)\n>>> atom.feature_selection(strategy=\"pca\", n_features=12, verbose=2)\n\nFitting FeatureSelector...\nPerforming feature selection ...\n --> Applying Principal Component Analysis...\n   --> Scaling features...\n   --> Keeping 12 components.\n   --> Explained variance ratio: 0.971\n\n\n>>> # Note that the column names changed\n>>> print(atom.dataset)\n\n         pca0      pca1      pca2      pca3      pca4      pca5      pca6      pca7      pca8      pca9     pca10     pca11  target\n0    1.933532  2.215152  1.268851 -1.776239  0.069615 -0.043647  0.281363  0.122942 -0.911086 -0.223754 -0.086316 -0.929486       1\n1    1.203025  6.706587  4.445104  0.087116  3.044271 -1.130720  0.820790 -0.593311 -1.004105  0.945411 -0.199241  0.948766       1\n2    4.506063 -1.419715 -1.216228  1.189962  0.227850  0.788522 -0.829805  0.521853 -0.381054  0.676945  0.004564  0.066630       0\n3   -2.179059  0.496110 -0.870279 -0.151235 -0.715354  0.983901 -0.232186  0.449653  0.350218  0.644448  0.280308 -0.544707       1\n4    0.708048  0.859536 -2.683579  0.295765  0.712158 -1.105250 -0.226270 -0.264257  0.494656 -0.643629 -0.152528 -0.008835       0\n..        ...       ...       ...       ...       ...       ...       ...       ...       ...       ...       ...       ...     
...\n564 -2.477152 -1.482251 -0.389774 -0.333742  0.627651 -0.475717 -0.048757 -0.337669  0.382336  0.132000  0.204445  0.118625       1\n565 -0.400165  0.078366 -2.082886 -1.024593  0.623709 -1.003931  0.571384  0.248557 -0.489957 -0.397008 -0.132552 -0.162104       0\n566 -2.956303 -0.111232 -0.770455  0.035805  0.308638  0.311849  0.119611 -0.994997  0.495694 -0.130586  0.214798  0.358027       1\n567 -5.409548 -0.784989  1.540835  2.205277  0.249963  1.552586  1.837439 -0.796343  0.508352  0.011600 -0.066693 -0.006518       1\n568 -3.648393 -1.340745  0.503077  4.546174 -0.221396  1.229170  0.687803  0.711380  0.527799  0.139843 -0.958308  0.834252       1\n\n[569 rows x 13 columns]\n
                                                                                                                                                                                  >>> from atom.feature_engineering import FeatureSelector\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, _ = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> fs = FeatureSelector(strategy=\"pca\", n_features=12, verbose=2)\n>>> X = fs.fit_transform(X)\n\nFitting FeatureSelector...\nPerforming feature selection ...\n --> Applying Principal Component Analysis...\n   --> Scaling features...\n   --> Keeping 12 components.\n   --> Explained variance ratio: 0.97\n\n\n>>> # Note that the column names changed\n>>> print(X)\n\n          pca0       pca1      pca2      pca3      pca4      pca5      pca6      pca7      pca8      pca9     pca10     pca11\n0     9.192837   1.948583 -1.123166  3.633731 -1.195110  1.411424  2.159370 -0.398407 -0.157118 -0.877402  0.262955 -0.859014\n1     2.387802  -3.768172 -0.529293  1.118264  0.621775  0.028656  0.013358  0.240988 -0.711905  1.106995  0.813120  0.157923\n2     5.733896  -1.075174 -0.551748  0.912083 -0.177086  0.541452 -0.668166  0.097374  0.024066  0.454275 -0.605604  0.124387\n3     7.122953  10.275589 -3.232790  0.152547 -2.960878  3.053422  1.429911  1.059565 -1.405440 -1.116975 -1.151514  1.011316\n4     3.935302  -1.948072  1.389767  2.940639  0.546747 -1.226495 -0.936213  0.636376 -0.263805  0.377704  0.651360 -0.110515\n..         ...        ...       ...       ...       ...       ...       ...       ...       ...       ...       ...       
...\n564   6.439315  -3.576817  2.459487  1.177314 -0.074824 -2.375193 -0.596130 -0.035471  0.987929  0.256989 -0.062651  0.123342\n565   3.793382  -3.584048  2.088476 -2.506028 -0.510723 -0.246710 -0.716326 -1.113360 -0.105207 -0.108632  0.244804  0.222753\n566   1.256179  -1.902297  0.562731 -2.089227  1.809991 -0.534447 -0.192758  0.341887  0.393917  0.520877 -0.840512  0.096473\n567  10.374794   1.672010 -1.877029 -2.356031 -0.033742  0.567936  0.223082 -0.280239 -0.542035 -0.089296 -0.178628 -0.697461\n568  -5.475243  -0.670637  1.490443 -2.299157 -0.184703  1.617837  1.698952  1.046354  0.374101 -0.047726 -0.144094 -0.179496\n\n[569 rows x 12 columns]\n
                                                                                                                                                                                  "}, {"location": "API/feature_engineering/featureselector/#methods", "title": "Methods", "text": "

                                                                                                                                                                                  fitFit the feature selector to the data.fit_transformFit to data, then transform it.get_metadata_routingGet metadata routing of this object.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformTransform the data.

                                                                                                                                                                                  method fit(X, y=None)[source]Fit the feature selector to the data.

                                                                                                                                                                                  The univariate, sfm (when model is not fitted), sfs, rfe and rfecv strategies need a target column. Leaving it None raises an exception.

                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  Returnsself Estimator instance.

                                                                                                                                                                                  method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                  method get_metadata_routing()[source]Get metadata routing of this object.

                                                                                                                                                                                  Returnsrouting : MetadataRequest A :class:~sklearn.utils.metadata_routing.MetadataRequest encapsulating routing information.

                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                  method inverse_transform(X=None, y=None)[source]Do nothing.

                                                                                                                                                                                  Returns the input unchanged. Implemented for continuity of the API.

                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                  Returnsdataframe Feature set. Only returned if provided.

                                                                                                                                                                                  series or dataframe Target column. Only returned if provided.

                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                  method transform(X, y=None)[source]Transform the data.

                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                  Returnsdataframe Transformed feature set.

                                                                                                                                                                                  "}, {"location": "API/models/adab/", "title": "AdaBoost", "text": "

                                                                                                                                                                                  AdaB accept sparse

                                                                                                                                                                                  AdaBoost is a meta-estimator that begins by fitting a classifier/regressor on the original dataset and then fits additional copies of the algorithm on the same dataset but where the weights of instances are adjusted according to the error of the current prediction.

                                                                                                                                                                                  Corresponding estimators are:

                                                                                                                                                                                  • AdaBoostClassifier for classification tasks.
                                                                                                                                                                                  • AdaBoostRegressor for regression tasks.

                                                                                                                                                                                  Read more in sklearn's documentation.

                                                                                                                                                                                  See Also

                                                                                                                                                                                  GradientBoostingMachine Gradient Boosting Machine.

                                                                                                                                                                                  RandomForest Random Forest.

                                                                                                                                                                                  XGBoost Extreme Gradient Boosting.

                                                                                                                                                                                  "}, {"location": "API/models/adab/#example", "title": "Example", "text": "
                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"AdaB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: AdaB\nMetric: f1\n\n\nResults for AdaBoost:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9583\nTime elapsed: 0.221s\n-------------------------------------------------\nTime: 0.221s\n\n\nFinal results ==================== >>\nTotal time: 0.224s\n-------------------------------------\nAdaBoost --> f1: 0.9583\n
                                                                                                                                                                                  "}, {"location": "API/models/adab/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                                                                                                                                                                                  Parametersn_estimatorsIntDistribution(high=500, log=False, low=50, step=10)learning_rateFloatDistribution(high=10.0, log=True, low=0.01, step=None)algorithmCategoricalDistribution(choices=('SAMME.R', 'SAMME'))

                                                                                                                                                                                  Parametersn_estimatorsIntDistribution(high=500, log=False, low=50, step=10)learning_rateFloatDistribution(high=10.0, log=True, low=0.01, step=None)lossCategoricalDistribution(choices=('linear', 'square', 'exponential'))

                                                                                                                                                                                  "}, {"location": "API/models/adab/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/adab/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                  Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                  Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                  Tip

                                                                                                                                                                                  Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                  mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                  The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                  "}, {"location": "API/models/adab/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                  Attributesname: strName of the model.

                                                                                                                                                                                  Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                  This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                  This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                  This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                  • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                  • estimator: Estimator used in this trial.
                                                                                                                                                                                  • [metric_name]: Metric score of the trial.
                                                                                                                                                                                  • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                  • time_trial: Duration of the trial.
                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                  • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                    For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                    This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                    All durations are in seconds. Possible values include:

                                                                                                                                                                                    • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                    • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                    • [metric]_train: Metric score on the train set.
                                                                                                                                                                                    • [metric]_test: Metric score on the test set.
                                                                                                                                                                                    • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                    • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                    • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                    • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                      The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                      "}, {"location": "API/models/adab/#methods", "title": "Methods", "text": "

                                                                                                                                                                                      The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                      bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                      method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                      Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                      Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                                                                                      reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                      method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                      Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                      method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                      This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                      Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                      cols: int, default=2 Number of plots in width.

                                                                                                                                                                                      horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                      vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                      title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                      • If None, no title is shown.
                                                                                                                                                                                      • If str, text for the title.
                                                                                                                                                                                      • If dict, title configuration.

                                                                                                                                                                                      legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                      • If None: No legend is shown.
                                                                                                                                                                                      • If str: Location where to show the legend.
                                                                                                                                                                                      • If dict: Legend configuration.

                                                                                                                                                                                      figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                      filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                      display: bool, default=True Whether to render the plot.

                                                                                                                                                                                      Yieldsgo.Figure Plot object.

                                                                                                                                                                                      method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                      Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                      • In-training validation scores
                                                                                                                                                                                      • Cached predictions.
                                                                                                                                                                                      • Shap values
                                                                                                                                                                                      • App instance
                                                                                                                                                                                      • Dashboard instance
                                                                                                                                                                                      • Calculated holdout data sets

                                                                                                                                                                                      method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                      Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                      method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                      ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                      By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                      Note

                                                                                                                                                                                      Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                      filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                      **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                      method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                      This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                      Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                      method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                      Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                      method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                      Tip

                                                                                                                                                                                      Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                      Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                      rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                      threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                      • The task is binary or multilabel classification.
                                                                                                                                                                                      • The model has a predict_proba method.
                                                                                                                                                                                      • The metric evaluates predicted probabilities.

                                                                                                                                                                                      For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                      Returnspd.Series Scores of the model.

                                                                                                                                                                                      method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                      The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                      ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                      method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                      The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                      ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                      y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                      method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                      In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                      Warning

                                                                                                                                                                                      Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                      Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                      method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                      Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                      Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                      method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                      Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                      Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                      reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                      method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                      Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                      ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                      • If None: y is ignored.
                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                      Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                      series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                      method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                      Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                      method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                      Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                      method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                      Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                      method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                      This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                      Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                      stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                      archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                      method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                      method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                      Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                      method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                      Info

                                                                                                                                                                                      If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                      • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                      metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                      Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                      method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                      The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                      Tip

                                                                                                                                                                                      Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                      Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                      host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                      port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                      method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                      Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                      ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                      • If None: y is ignored.
                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                      Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                      series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                      method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                      Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                      method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                      Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                      "}, {"location": "API/models/ard/", "title": "AutomaticRelevanceDetermination", "text": "

                                                                                                                                                                                      ARD needs scaling

                                                                                                                                                                                      Automatic Relevance Determination is very similar to BayesianRidge, but can lead to sparser coefficients. Fit the weights of a regression model, using an ARD prior. The weights of the regression model are assumed to be in Gaussian distributions.

                                                                                                                                                                                      Corresponding estimators are:

                                                                                                                                                                                      • ARDRegression for regression tasks.

                                                                                                                                                                                      Read more in sklearn's documentation.

                                                                                                                                                                                      See Also

                                                                                                                                                                                      BayesianRidge Bayesian ridge regression.

                                                                                                                                                                                      GaussianProcess Gaussian process.

                                                                                                                                                                                      LeastAngleRegression Least Angle Regression.

                                                                                                                                                                                      "}, {"location": "API/models/ard/#example", "title": "Example", "text": "
                                                                                                                                                                                      >>> from atom import ATOMRegressor\n>>> from sklearn.datasets import fetch_california_housing\n\n>>> X, y = fetch_california_housing(return_X_y=True)\n\n>>> atom = ATOMRegressor(X, y, random_state=1)\n>>> atom.run(models=\"ARD\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= >>\nModels: ARD\nMetric: r2\n\n\nResults for AutomaticRelevanceDetermination:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.6067\nTest evaluation --> r2: 0.6029\nTime elapsed: 0.139s\n-------------------------------------------------\nTime: 0.139s\n\n\nFinal results ==================== >>\nTotal time: 0.140s\n-------------------------------------\nAutomaticRelevanceDetermination --> r2: 0.6029\n
                                                                                                                                                                                      "}, {"location": "API/models/ard/#hyperparameters", "title": "Hyperparameters", "text": "

                                                                                                                                                                                      Parametersn_iterIntDistribution(high=1000, log=False, low=100, step=10)alpha_1FloatDistribution(high=1.0, log=True, low=0.0001, step=None)alpha_2FloatDistribution(high=1.0, log=True, low=0.0001, step=None)lambda_1FloatDistribution(high=1.0, log=True, low=0.0001, step=None)lambda_2FloatDistribution(high=1.0, log=True, low=0.0001, step=None)

                                                                                                                                                                                      "}, {"location": "API/models/ard/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/ard/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                      Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                      Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                      Tip

                                                                                                                                                                                      Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                      mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                      The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                      "}, {"location": "API/models/ard/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                      Attributesname: strName of the model.

                                                                                                                                                                                      Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                      This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                      This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                      This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                      • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                      • estimator: Estimator used in this trial.
                                                                                                                                                                                      • [metric_name]: Metric score of the trial.
                                                                                                                                                                                      • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                      • time_trial: Duration of the trial.
                                                                                                                                                                                      • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                      • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                        For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                        This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                        The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                        All durations are in seconds. Possible values include:

                                                                                                                                                                                        • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                        • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                        • [metric]_train: Metric score on the train set.
                                                                                                                                                                                        • [metric]_test: Metric score on the test set.
                                                                                                                                                                                        • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                        • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                        • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                        • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                          The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                          "}, {"location": "API/models/ard/#methods", "title": "Methods", "text": "

                                                                                                                                                                                          The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                          bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                          method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                          Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                          Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                                                                                          reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                          method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                          Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                          This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                          Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                          cols: int, default=2 Number of plots in width.

                                                                                                                                                                                          horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                          vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                          title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                          • If None, no title is shown.
                                                                                                                                                                                          • If str, text for the title.
                                                                                                                                                                                          • If dict, title configuration.

                                                                                                                                                                                          legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                          • If None: No legend is shown.
                                                                                                                                                                                          • If str: Location where to show the legend.
                                                                                                                                                                                          • If dict: Legend configuration.

                                                                                                                                                                                          figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                          filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                          display: bool, default=True Whether to render the plot.

                                                                                                                                                                                          Yieldsgo.Figure Plot object.

                                                                                                                                                                                          method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                          Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                          • In-training validation scores
                                                                                                                                                                                          • Cached predictions.
                                                                                                                                                                                          • Shap values
                                                                                                                                                                                          • App instance
                                                                                                                                                                                          • Dashboard instance
                                                                                                                                                                                          • Calculated holdout data sets

                                                                                                                                                                                          method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                          Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                          method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                          ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                          By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                          Note

                                                                                                                                                                                          Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                          filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                          **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                          method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                          This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                          Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                          method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                          Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                          method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                          Tip

                                                                                                                                                                                          Use the get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                          Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                          rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                          threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                          • The task is binary or multilabel classification.
                                                                                                                                                                                          • The model has a predict_proba method.
                                                                                                                                                                                          • The metric evaluates predicted probabilities.

                                                                                                                                                                                          For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                          Returnspd.Series Scores of the model.

                                                                                                                                                                                          method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                          The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                          ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                          method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                          The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                          ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                          y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                          method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                          In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                          Warning

                                                                                                                                                                                          Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                          Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                          method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                          Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                          Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                          method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                          Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                          Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                          reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                          method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                          Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                          ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                          • If None: y is ignored.
                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                          Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                          series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                          method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                          Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                          method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                          Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                          method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                          Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                          method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                          This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                          Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                          stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                          archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                          method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                          method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                          Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                          method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                          Info

                                                                                                                                                                                          If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                          • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                          metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                          Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                          method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                          The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                          Tip

                                                                                                                                                                                          Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                          Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                          host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                          port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                          method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                          Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                          ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                          • If None: y is ignored.
                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                          Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                          series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                          method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                          Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                          method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                          Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                          "}, {"location": "API/models/arima/", "title": "ARIMA", "text": "

                                                                                                                                                                                          ARIMA native multioutput

                                                                                                                                                                                          Seasonal ARIMA models and exogeneous input is supported, hence this estimator is capable of fitting SARIMA, ARIMAX, and SARIMAX.

                                                                                                                                                                                          An ARIMA model, is a generalization of an autoregressive moving average (ARMA) model, and is fitted to time-series data in an effort to forecast future points. ARIMA models can be especially efficacious in cases where data shows evidence of non-stationarity.

                                                                                                                                                                                          The \"AR\" part of ARIMA indicates that the evolving variable of interest is regressed on its own lagged (i.e., prior observed) values. The \"MA\" part indicates that the regression error is actually a linear combination of error terms whose values occurred contemporaneously and at various times in the past. The \"I\" (for \"integrated\") indicates that the data values have been replaced with the difference between their values and the previous values (and this differencing process may have been performed more than once).

                                                                                                                                                                                          Corresponding estimators are:

                                                                                                                                                                                          • ARIMA for forecasting tasks.

                                                                                                                                                                                          Warning

                                                                                                                                                                                          ARIMA often runs into numerical errors when optimizing the hyperparameters. Possible solutions are:

                                                                                                                                                                                          • Use the AutoARIMA model instead.
                                                                                                                                                                                          • Use est_params to specify the orders manually, e.g., atom.run(\"arima\", n_trials=5,est_params={\"order\": (1, 1, 0)}).
                                                                                                                                                                                          • Use the catch parameter in ht_params to avoid raising every exception, e.g., atom.run(\"arima\",n_trials=5, ht_params={\"catch\": (Exception,)}).

                                                                                                                                                                                          See Also

                                                                                                                                                                                          AutoARIMA Automatic Autoregressive Integrated Moving Average Model.

                                                                                                                                                                                          "}, {"location": "API/models/arima/#example", "title": "Example", "text": "
                                                                                                                                                                                          >>> from atom import ATOMForecaster\n>>> from sktime.datasets import load_longley\n\n>>> _, X = load_longley()\n\n>>> atom = ATOMForecaster(X)\n>>> atom.run(models=\"ARIMA\", verbose=2)\n\n\nTraining ========================= >>\nModels: ARIMA\nMetric: mape\n\n\nResults for ARIMA:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0131\nTest evaluation --> mape: -0.0364\nTime elapsed: 0.214s\n-------------------------------------------------\nTime: 0.214s\n\n\nFinal results ==================== >>\nTotal time: 0.215s\n-------------------------------------\nARIMA --> mape: -0.0364\n
                                                                                                                                                                                          "}, {"location": "API/models/arima/#hyperparameters", "title": "Hyperparameters", "text": "

                                                                                                                                                                                          ParameterspIntDistribution(high=2, log=False, low=0, step=1)dIntDistribution(high=1, log=False, low=0, step=1)qIntDistribution(high=2, log=False, low=0, step=1)PIntDistribution(high=2, log=False, low=0, step=1)DIntDistribution(high=1, log=False, low=0, step=1)QIntDistribution(high=2, log=False, low=0, step=1)SCategoricalDistribution(choices=(0, 4, 6, 7, 12))methodCategoricalDistribution(choices=('newton', 'nm', 'bfgs', 'lbfgs', 'powell', 'cg', 'ncg', 'basinhopping'))maxiterIntDistribution(high=200, log=False, low=50, step=10)with_interceptCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                          "}, {"location": "API/models/arima/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/arima/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                          Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                          Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                          Tip

                                                                                                                                                                                          Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                          mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                          The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                          "}, {"location": "API/models/arima/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                          Attributesname: strName of the model.

                                                                                                                                                                                          Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                          This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                          This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                          This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                          • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                          • estimator: Estimator used in this trial.
                                                                                                                                                                                          • [metric_name]: Metric score of the trial.
                                                                                                                                                                                          • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                          • time_trial: Duration of the trial.
                                                                                                                                                                                          • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                          • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                            For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See an example here. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                            This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                            The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                            All durations are in seconds. Possible values include:

                                                                                                                                                                                            • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                            • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                            • [metric]_train: Metric score on the train set.
                                                                                                                                                                                            • [metric]_test: Metric score on the test set.
                                                                                                                                                                                            • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                            • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                            • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                            • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                              The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                              "}, {"location": "API/models/arima/#methods", "title": "Methods", "text": "

                                                                                                                                                                                              The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                              bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                              method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                              Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                              Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                                                                                              reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                              method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                              Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                              method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                              This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                              Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                              cols: int, default=2 Number of plots in width.

                                                                                                                                                                                              horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                              vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                              title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                              • If None, no title is shown.
                                                                                                                                                                                              • If str, text for the title.
                                                                                                                                                                                              • If dict, title configuration.

                                                                                                                                                                                              legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                              • If None: No legend is shown.
                                                                                                                                                                                              • If str: Location where to show the legend.
                                                                                                                                                                                              • If dict: Legend configuration.

                                                                                                                                                                                              figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                              filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                              display: bool, default=True Whether to render the plot.

                                                                                                                                                                                              Yieldsgo.Figure Plot object.

                                                                                                                                                                                              method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                              Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                              • In-training validation scores
                                                                                                                                                                                              • Cached predictions.
                                                                                                                                                                                              • Shap values
                                                                                                                                                                                              • App instance
                                                                                                                                                                                              • Dashboard instance
                                                                                                                                                                                              • Calculated holdout data sets

                                                                                                                                                                                              method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                              Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                              method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                              ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                              By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                              Note

                                                                                                                                                                                              Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                              filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                              **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                              method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                              This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                              Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                              method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                              Tip

                                                                                                                                                                                              Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                              Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                              rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                              threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                              • The task is binary or multilabel classification.
                                                                                                                                                                                              • The model has a predict_proba method.
                                                                                                                                                                                              • The metric evaluates predicted probabilities.

                                                                                                                                                                                              For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                              Returnspd.Series Scores of the model.

                                                                                                                                                                                              method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                              The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                              ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                              method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                              The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                              ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                              y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                              method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                              In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                              Warning

                                                                                                                                                                                              Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                              Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                              method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                              Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                              Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                              method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                              Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                              Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                              reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                              method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                              Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                              ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                              • If None: y is ignored.
                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                              Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                              series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                              method predict(fh, X=None, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                              Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                              Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                                                                                                                                                              method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]Get prediction intervals on new data or existing rows.

                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_interval method.

                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                              Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                              coverage: float or sequence, default=0.9 Nominal coverage(s) of predictive interval(s).

                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                              Returnsdataframe Predictions with shape=(n_samples, 2) or shape=(n_samples, 2 * n_targets) for multivariate tasks.

                                                                                                                                                                                              method predict_proba(fh, X=None, marginal=True, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                              Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                              marginal: bool, default=True Whether returned distribution is marginal by time index.

                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                              Returnssktime.proba.Normal Predicted distribution.

                                                                                                                                                                                              method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_quantiles method.

                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                              Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                              alpha: float or list of float, default=[0.05, 0.95] A probability or list of, at which quantile forecasts are computed.

                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                              Returnsdataframe Predictions with shape=(n_samples, len(alpha)) or shape=(n_samples, len(alpha) * n_targets) for multivariate tasks.

                                                                                                                                                                                              method predict_residuals(y, X=None, verbose=None)[source]Get residuals of forecasts on new data or existing rows.

                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_residuals method.

                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                              Parametersy: int, str, dict, sequence or dataframe Ground truth observations.

                                                                                                                                                                                              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to y.

                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                              Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                                                                                                                                                              method predict_var(fh, X=None, cov=False, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_var method.

                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                              Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                              cov: bool, default=False Whether to compute covariance matrix forecast or marginal variance forecasts.

                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                              Returnsdataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                                                                                                                                                              method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                              This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                              Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                              stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                              archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                              method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                              method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                              Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                              method score(y, X=None, fh=None, metric=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                              Info

                                                                                                                                                                                              If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sktime's score method for estimators.

                                                                                                                                                                                              Parametersy: int, str, dict, sequence or dataframe Ground truth observations.

                                                                                                                                                                                              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                              fh: int, sequence or ForecastingHorizon or None, default=None The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                              metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                              Returnsfloat Metric score of y with respect to a ground truth.

                                                                                                                                                                                              method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                              The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                              Tip

                                                                                                                                                                                              Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                              Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                              host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                              port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                              method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                              Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                              ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                              • If None: y is ignored.
                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                              Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                              series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                              method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                              Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                              method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                              Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                              "}, {"location": "API/models/autoarima/", "title": "AutoARIMA", "text": "

                                                                                                                                                                                              AutoARIMA native multioutput

                                                                                                                                                                                              ARIMA implementation that includes automated fitting of (S)ARIMA(X) hyperparameters (p, d, q, P, D, Q). The AutoARIMA algorithm seeks to identify the most optimal parameters for an ARIMA model, settling on a single fitted ARIMA model. This process is based on the commonly-used R function.

                                                                                                                                                                                              AutoARIMA works by conducting differencing tests (i.e., Kwiatkowski\u2013Phillips\u2013Schmidt\u2013Shin, Augmented Dickey-Fuller or Phillips\u2013Perron) to determine the order of differencing, d, and then fitting models within defined ranges. AutoARIMA also seeks to identify the optimal P and Q hyperparameters after conducting the Canova-Hansen test to determine the optimal order of seasonal differencing.

                                                                                                                                                                                              Note that due to stationarity issues, AutoARIMA might not find a suitable model that will converge. If this is the case, a ValueError is thrown suggesting stationarity-inducing measures be taken prior to re-fitting or that a new range of order values be selected.

                                                                                                                                                                                              Corresponding estimators are:

                                                                                                                                                                                              • AutoARIMA for forecasting tasks.

                                                                                                                                                                                              See Also

                                                                                                                                                                                              ARIMA Autoregressive Integrated Moving Average Model.

                                                                                                                                                                                              ETS ETS model with automatic fitting capabilities.

                                                                                                                                                                                              "}, {"location": "API/models/autoarima/#example", "title": "Example", "text": "
                                                                                                                                                                                              >>> from atom import ATOMForecaster\n>>> from sktime.datasets import load_longley\n\n>>> _, X = load_longley()\n\n>>> atom = ATOMForecaster(X, random_state=1)\n>>> atom.run(models=\"autoarima\", verbose=2)\n\n\nTraining ========================= >>\nModels: AutoARIMA\nMetric: mape\n\n\nResults for AutoARIMA:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0131\nTest evaluation --> mape: -0.0359\nTime elapsed: 0.437s\n-------------------------------------------------\nTime: 0.437s\n\n\nFinal results ==================== >>\nTotal time: 0.438s\n-------------------------------------\nAutoARIMA --> mape: -0.0359\n
                                                                                                                                                                                              "}, {"location": "API/models/autoarima/#hyperparameters", "title": "Hyperparameters", "text": "

                                                                                                                                                                                              ParametersmethodCategoricalDistribution(choices=('newton', 'nm', 'bfgs', 'lbfgs', 'powell', 'cg', 'ncg', 'basinhopping'))maxiterIntDistribution(high=200, log=False, low=50, step=10)with_interceptCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                              "}, {"location": "API/models/autoarima/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/autoarima/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                              Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                              Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                              Tip

                                                                                                                                                                                              Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                              mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                              The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                              "}, {"location": "API/models/autoarima/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                              Attributesname: strName of the model.

                                                                                                                                                                                              Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                              This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                              This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                              This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                              • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                              • estimator: Estimator used in this trial.
                                                                                                                                                                                              • [metric_name]: Metric score of the trial.
                                                                                                                                                                                              • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                              • time_trial: Duration of the trial.
                                                                                                                                                                                              • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                              • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here for an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                All durations are in seconds. Possible values include:

                                                                                                                                                                                                • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                  The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                  "}, {"location": "API/models/autoarima/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                  The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                  bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                  method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                  Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                  Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                                                                                                  reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                  method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                  Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                                  • Cached predictions.
                                                                                                                                                                                                  • Shap values
                                                                                                                                                                                                  • App instance
                                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                                  method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                  Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                  method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                  ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                  By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                  Note

                                                                                                                                                                                                  Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                  filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                  **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                  method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                  This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                  Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                  Tip

                                                                                                                                                                                                  Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                  Returnspd.Series Scores of the model.

                                                                                                                                                                                                  method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                  The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                  method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                  The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                  ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                  y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                  method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                  In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                  Warning

                                                                                                                                                                                                  Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                  Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                  method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                  Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                  Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                  method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                  Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                  Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                  reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                  method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                  Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                  series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                  method predict(fh, X=None, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                  Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                                                                                                                                                                  method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]Get prediction intervals on new data or existing rows.

                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_interval method.

                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                  coverage: float or sequence, default=0.9 Nominal coverage(s) of predictive interval(s).

                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                  Returnsdataframe Predictions with shape=(n_samples, 2) or shape=(n_samples, 2 * n_targets) for multivariate tasks.

                                                                                                                                                                                                  method predict_proba(fh, X=None, marginal=True, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                  marginal: bool, default=True Whether returned distribution is marginal by time index.

                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                  Returnssktime.proba.Normal Predicted distribution.

                                                                                                                                                                                                  method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_quantiles method.

                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                  alpha: float or list of float, default=[0.05, 0.95] A probability or list of, at which quantile forecasts are computed.

                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                  Returnsdataframe Predictions with shape=(n_samples, len(alpha)) or shape=(n_samples, len(alpha) * n_targets) for multivariate tasks.

                                                                                                                                                                                                  method predict_residuals(y, X=None, verbose=None)[source]Get residuals of forecasts on new data or existing rows.

                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_residuals method.

                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                  Parametersy: int, str, dict, sequence or dataframe Ground truth observations.

                                                                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to y.

                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                  Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                                                                                                                                                                  method predict_var(fh, X=None, cov=False, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_var method.

                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                  cov: bool, default=False Whether to compute covariance matrix forecast or marginal variance forecasts.

                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                  Returnsdataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                                                                                                                                                                  method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                  This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                  Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                  stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                  archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                  method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                  method score(y, X=None, fh=None, metric=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                  Info

                                                                                                                                                                                                  If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sktime's score method for estimators.

                                                                                                                                                                                                  Parametersy: int, str, dict, sequence or dataframe Ground truth observations.

                                                                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                  fh: int, sequence or ForecastingHorizon or None, default=None The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                  metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                  Returnsfloat Metric score of y with respect to a ground truth.

                                                                                                                                                                                                  method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                  The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                  Tip

                                                                                                                                                                                                  Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                  Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                  host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                  port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                  method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                  "}, {"location": "API/models/bag/", "title": "Bagging", "text": "

                                                                                                                                                                                                  Bag accept sparse

                                                                                                                                                                                                  Bagging uses an ensemble meta-estimator that fits base predictors on random subsets of the original dataset and then aggregates their individual predictions (either by voting or by averaging) to form a final prediction. Such a meta-estimator can typically be used as a way to reduce the variance of a black-box estimator by introducing randomization into its construction procedure and then making an ensemble out of it.

                                                                                                                                                                                                  Corresponding estimators are:

                                                                                                                                                                                                  • BaggingClassifier for classification tasks.
                                                                                                                                                                                                  • BaggingRegressor for regression tasks.

                                                                                                                                                                                                  Read more in sklearn's documentation.

                                                                                                                                                                                                  See Also

                                                                                                                                                                                                  DecisionTree Single Decision Tree.

                                                                                                                                                                                                  LogisticRegression Logistic Regression.

                                                                                                                                                                                                  RandomForest Random Forest.

                                                                                                                                                                                                  "}, {"location": "API/models/bag/#example", "title": "Example", "text": "
                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"Bag\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: Bag\nMetric: f1\n\n\nResults for Bagging:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9982\nTest evaluation --> f1: 0.9444\nTime elapsed: 0.101s\n-------------------------------------------------\nTime: 0.101s\n\n\nFinal results ==================== >>\nTotal time: 0.104s\n-------------------------------------\nBagging --> f1: 0.9444\n
                                                                                                                                                                                                  "}, {"location": "API/models/bag/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                                                                                                                                                                                                  Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)max_samplesFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)max_featuresFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)bootstrapCategoricalDistribution(choices=(True, False))bootstrap_featuresCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                                  Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)max_samplesFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)max_featuresFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)bootstrapCategoricalDistribution(choices=(True, False))bootstrap_featuresCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                                  "}, {"location": "API/models/bag/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/bag/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                  Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                  Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                  Tip

                                                                                                                                                                                                  Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                  mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                  The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                  "}, {"location": "API/models/bag/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                  Attributesname: strName of the model.

                                                                                                                                                                                                  Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                  This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                  This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                  This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                  • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                  • estimator: Estimator used in this trial.
                                                                                                                                                                                                  • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                  • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                  • time_trial: Duration of the trial.
                                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                  • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                    For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here for an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                    This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                    All durations are in seconds. Possible values include:

                                                                                                                                                                                                    • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                    • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                    • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                    • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                    • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                    • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                    • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                    • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                      The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                      "}, {"location": "API/models/bag/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                      The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                      bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                      method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                      Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                      Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                                                                                                      reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                      method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                      Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                      method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                      This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                      Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                      cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                      horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                      vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                      title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                      • If None, no title is shown.
                                                                                                                                                                                                      • If str, text for the title.
                                                                                                                                                                                                      • If dict, title configuration.

                                                                                                                                                                                                      legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                      • If None: No legend is shown.
                                                                                                                                                                                                      • If str: Location where to show the legend.
                                                                                                                                                                                                      • If dict: Legend configuration.

                                                                                                                                                                                                      figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                      filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                      display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                      Yieldsgo.Figure Plot object.

                                                                                                                                                                                                      method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                      Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                      • In-training validation scores
                                                                                                                                                                                                      • Cached predictions.
                                                                                                                                                                                                      • Shap values
                                                                                                                                                                                                      • App instance
                                                                                                                                                                                                      • Dashboard instance
                                                                                                                                                                                                      • Calculated holdout data sets

                                                                                                                                                                                                      method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                      Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                      method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                      ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                      By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                      Note

                                                                                                                                                                                                      Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                      filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                      **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                      method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                      This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                      Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                      method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                      Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                      method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                      Tip

                                                                                                                                                                                                      Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                      Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                      rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                      threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                      • The task is binary or multilabel classification.
                                                                                                                                                                                                      • The model has a predict_proba method.
                                                                                                                                                                                                      • The metric evaluates predicted probabilities.

                                                                                                                                                                                                      For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                      Returnspd.Series Scores of the model.

                                                                                                                                                                                                      method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                      The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                      ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                      method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                      The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                      ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                      y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                      method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                      In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                      Warning

                                                                                                                                                                                                      Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                      Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                      method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                      Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                      Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                      method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                      Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                      Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                      reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                      method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                      Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                      ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                      • If None: y is ignored.
                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                      Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                      series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                      method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                      Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                      method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                      Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                      method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                      Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                      method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                      This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                      Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                      stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                      archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                      method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                      method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                      Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                      method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                      Info

                                                                                                                                                                                                      If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                      • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                      metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                      Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                      method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                      The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                      Tip

                                                                                                                                                                                                      Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                      Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                      host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                      port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                      method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                      Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                      ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                      • If None: y is ignored.
                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                      Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                      series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                      method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                      Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                      method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                      Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                      "}, {"location": "API/models/bnb/", "title": "BernoulliNB", "text": "

                                                                                                                                                                                                      BNB accept sparse supports acceleration

                                                                                                                                                                                                      BernoulliNB implements the Naive Bayes algorithm for multivariate Bernoulli models. Like MultinomialNB, this classifier is suitable for discrete data. The difference is that while MNB works with occurrence counts, BNB is designed for binary/boolean features.

                                                                                                                                                                                                      Corresponding estimators are:

                                                                                                                                                                                                      • BernoulliNB for classification tasks.

                                                                                                                                                                                                      Read more in sklearn's documentation.

                                                                                                                                                                                                      See Also

                                                                                                                                                                                                      ComplementNB Complement Naive Bayes.

                                                                                                                                                                                                      CategoricalNB Categorical Naive Bayes.

                                                                                                                                                                                                      MultinomialNB Multinomial Naive Bayes.

                                                                                                                                                                                                      "}, {"location": "API/models/bnb/#example", "title": "Example", "text": "
                                                                                                                                                                                                      >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"BNB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: BNB\nMetric: f1\n\n\nResults for BernoulliNB:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.7709\nTest evaluation --> f1: 0.7717\nTime elapsed: 0.023s\n-------------------------------------------------\nTime: 0.023s\n\n\nFinal results ==================== >>\nTotal time: 0.026s\n-------------------------------------\nBernoulliNB --> f1: 0.7717\n
                                                                                                                                                                                                      "}, {"location": "API/models/bnb/#hyperparameters", "title": "Hyperparameters", "text": "sklearncuml

                                                                                                                                                                                                      ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                                      ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                                      "}, {"location": "API/models/bnb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/bnb/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                      Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                      Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                      Tip

                                                                                                                                                                                                      Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                      mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                      The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                      "}, {"location": "API/models/bnb/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                      Attributesname: strName of the model.

                                                                                                                                                                                                      Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                      This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                      This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                      This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                      • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                      • estimator: Estimator used in this trial.
                                                                                                                                                                                                      • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                      • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                      • time_trial: Duration of the trial.
                                                                                                                                                                                                      • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                      • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                        For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                        This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                        The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                        All durations are in seconds. Possible values include:

                                                                                                                                                                                                        • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                        • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                        • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                        • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                        • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                        • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                        • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                        • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                          The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                          "}, {"location": "API/models/bnb/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                          The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                          bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                          method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                          Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                          Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                                                                                                          reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                          method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                          Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                          This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                          Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                          cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                          horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                          vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                          title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                          • If None, no title is shown.
                                                                                                                                                                                                          • If str, text for the title.
                                                                                                                                                                                                          • If dict, title configuration.

                                                                                                                                                                                                          legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                          • If None: No legend is shown.
                                                                                                                                                                                                          • If str: Location where to show the legend.
                                                                                                                                                                                                          • If dict: Legend configuration.

                                                                                                                                                                                                          figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                          filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                          display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                          Yieldsgo.Figure Plot object.

                                                                                                                                                                                                          method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                          Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                          • In-training validation scores
                                                                                                                                                                                                          • Cached predictions.
                                                                                                                                                                                                          • Shap values
                                                                                                                                                                                                          • App instance
                                                                                                                                                                                                          • Dashboard instance
                                                                                                                                                                                                          • Calculated holdout data sets

                                                                                                                                                                                                          method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                          Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                          method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                          ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                          By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                          Note

                                                                                                                                                                                                          Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                          filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                          **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                          method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                          This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                          Returns pd.DataFrame Overview of the results.

                                                                                                                                                                                                          method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                          Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                          method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                          Tip

                                                                                                                                                                                                          Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                          Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                          rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                          threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                          • The task is binary or multilabel classification.
                                                                                                                                                                                                          • The model has a predict_proba method.
                                                                                                                                                                                                          • The metric evaluates predicted probabilities.

                                                                                                                                                                                                          For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                          Returnspd.Series Scores of the model.

                                                                                                                                                                                                          method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                          The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                          ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                          method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                          The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                          ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                          y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                          method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                          In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                          Warning

                                                                                                                                                                                                          Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                          Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                          method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                          Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                          Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                          method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                          Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                          Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                          reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                          method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                          Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                          ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                          • If None: y is ignored.
                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                          Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                          series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                          method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                          Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                          method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                          Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                          method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                          Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                          method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                          This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                          Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                          stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                          archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                          method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                          method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                          Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                          method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                          Info

                                                                                                                                                                                                          If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                          • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                          metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                          Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                          method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                          The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                          Tip

                                                                                                                                                                                                          Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                          Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                          host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                          port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                          method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                          Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                          ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                          • If None: y is ignored.
                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                          Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                          series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                          method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                          Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                          method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                          Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                          "}, {"location": "API/models/br/", "title": "BayesianRidge", "text": "

                                                                                                                                                                                                          BR needs scaling

                                                                                                                                                                                                          Bayesian regression techniques can be used to include regularization parameters in the estimation procedure: the regularization parameter is not set in a hard sense but tuned to the data at hand.

                                                                                                                                                                                                          Corresponding estimators are:

                                                                                                                                                                                                          • BayesianRidge for regression tasks.

                                                                                                                                                                                                          Read more in sklearn's documentation.

                                                                                                                                                                                                          See Also

                                                                                                                                                                                                          AutomaticRelevanceDetermination Automatic Relevance Determination.

                                                                                                                                                                                                          GaussianProcess Gaussian process.

                                                                                                                                                                                                          LeastAngleRegression Least Angle Regression.

                                                                                                                                                                                                          "}, {"location": "API/models/br/#example", "title": "Example", "text": "
                                                                                                                                                                                                          >>> from atom import ATOMRegressor\n>>> from sklearn.datasets import fetch_california_housing\n\n>>> X, y = fetch_california_housing(return_X_y=True)\n\n>>> atom = ATOMRegressor(X, y, random_state=1)\n>>> atom.run(models=\"BR\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= >>\nModels: BR\nMetric: r2\n\n\nResults for BayesianRidge:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.6067\nTest evaluation --> r2: 0.6028\nTime elapsed: 0.138s\n-------------------------------------------------\nTime: 0.138s\n\n\nFinal results ==================== >>\nTotal time: 0.139s\n-------------------------------------\nBayesianRidge --> r2: 0.6028\n
                                                                                                                                                                                                          "}, {"location": "API/models/br/#hyperparameters", "title": "Hyperparameters", "text": "

                                                                                                                                                                                                          Parametersn_iterIntDistribution(high=1000, log=False, low=100, step=10)alpha_1FloatDistribution(high=1.0, log=True, low=0.0001, step=None)alpha_2FloatDistribution(high=1.0, log=True, low=0.0001, step=None)lambda_1FloatDistribution(high=1.0, log=True, low=0.0001, step=None)lambda_2FloatDistribution(high=1.0, log=True, low=0.0001, step=None)

                                                                                                                                                                                                          "}, {"location": "API/models/br/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/br/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                          Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                          Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                          Tip

                                                                                                                                                                                                          Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                          mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                          The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                          "}, {"location": "API/models/br/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                          Attributesname: strName of the model.

                                                                                                                                                                                                          Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                          This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                          This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                          This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                          • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                          • estimator: Estimator used in this trial.
                                                                                                                                                                                                          • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                          • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                          • time_trial: Duration of the trial.
                                                                                                                                                                                                          • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                          • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                            For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                            This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                            The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                            All durations are in seconds. Possible values include:

                                                                                                                                                                                                            • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                            • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                            • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                            • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                            • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                            • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                            • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                            • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                              The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                              "}, {"location": "API/models/br/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                              The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                              bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                              method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                              Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                              Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                                                                                                              reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                              method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                              Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                              method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                              This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                              Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                              cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                              horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                              vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                              title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                              • If None, no title is shown.
                                                                                                                                                                                                              • If str, text for the title.
                                                                                                                                                                                                              • If dict, title configuration.

                                                                                                                                                                                                              legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                              • If None: No legend is shown.
                                                                                                                                                                                                              • If str: Location where to show the legend.
                                                                                                                                                                                                              • If dict: Legend configuration.

                                                                                                                                                                                                              figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                              filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                              display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                              Yieldsgo.Figure Plot object.

                                                                                                                                                                                                              method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                              Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                              • In-training validation scores
                                                                                                                                                                                                              • Cached predictions.
                                                                                                                                                                                                              • Shap values
                                                                                                                                                                                                              • App instance
                                                                                                                                                                                                              • Dashboard instance
                                                                                                                                                                                                              • Calculated holdout data sets

                                                                                                                                                                                                              method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                              Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                              method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                              ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                              By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                              Note

                                                                                                                                                                                                              Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                              filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                              **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                              method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                              This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                              Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                              method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                              Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                              method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                              Tip

                                                                                                                                                                                                              Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                              Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                              rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                              threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                              • The task is binary or multilabel classification.
                                                                                                                                                                                                              • The model has a predict_proba method.
                                                                                                                                                                                                              • The metric evaluates predicted probabilities.

                                                                                                                                                                                                              For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                              Returnspd.Series Scores of the model.

                                                                                                                                                                                                              method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                              The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                              ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                              method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                              The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                              ParametersX: dataframe or None, default=None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                              y: series, dataframe or None, default=None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                              method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                              In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                              Warning

                                                                                                                                                                                                              Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                              Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                              method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                              Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                              Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                              method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                              Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                              Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                              reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                              method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                              Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                              ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                              • If None: y is ignored.
                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                              Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                              series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                              method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                              Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                              method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                              Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                              method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                              Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                              method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                              This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                              Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                              stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                              archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                              method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                              method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                              Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                              method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                              Info

                                                                                                                                                                                                              If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                              • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                              metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                              Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                              method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                              The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                              Tip

                                                                                                                                                                                                              Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                              Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                              host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                              port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                              method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                              Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                              ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                              • If None: y is ignored.
                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                              Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                              series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                              method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                              Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                              method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                              Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                              "}, {"location": "API/models/catb/", "title": "CatBoost", "text": "

                                                                                                                                                                                                              CatB needs scaling accept sparse allows validation supports acceleration

                                                                                                                                                                                                              CatBoost is a machine learning method based on gradient boosting over decision trees. Main advantages of CatBoost:

                                                                                                                                                                                                              • Superior quality when compared with other GBDT models on many datasets.
                                                                                                                                                                                                              • Best in class prediction speed.

                                                                                                                                                                                                              Corresponding estimators are:

                                                                                                                                                                                                              • CatBoostClassifier for classification tasks.
                                                                                                                                                                                                              • CatBoostRegressor for regression tasks.

                                                                                                                                                                                                              Read more in CatBoost's documentation.

                                                                                                                                                                                                              Warning

                                                                                                                                                                                                              • CatBoost selects the weights achieved by the best evaluation on the test set after training. This means that, by default, there is some minor data leakage in the test set. Use the use_best_model=False parameter to avoid this behavior or use a holdout set to evaluate the final estimator.
                                                                                                                                                                                                              • In-training validation and pruning are disabled when device=\"gpu\".

                                                                                                                                                                                                              Note

                                                                                                                                                                                                              ATOM uses CatBoost's n_estimators parameter instead of iterations to indicate the number of trees to fit. This is done to have consistent naming with the XGBoost and LightGBM models.

                                                                                                                                                                                                              See Also

                                                                                                                                                                                                              GradientBoostingMachine Gradient Boosting Machine.

                                                                                                                                                                                                              LightGBM Light Gradient Boosting Machine.

                                                                                                                                                                                                              XGBoost Extreme Gradient Boosting.

                                                                                                                                                                                                              "}, {"location": "API/models/catb/#example", "title": "Example", "text": "
                                                                                                                                                                                                              >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"CatB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: CatB\nMetric: f1\n\n\nResults for CatBoost:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9655\nTime elapsed: 14.218s\n-------------------------------------------------\nTime: 14.218s\n\n\nFinal results ==================== >>\nTotal time: 14.221s\n-------------------------------------\nCatBoost --> f1: 0.9655\n
                                                                                                                                                                                                              "}, {"location": "API/models/catb/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                                                                                                                                                                                                              Parametersn_estimatorsIntDistribution(high=500, log=False, low=20, step=10)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_child_samplesIntDistribution(high=30, log=False, low=1, step=1)bootstrap_typeCategoricalDistribution(choices=('Bayesian', 'Bernoulli'))bagging_temperatureFloatDistribution(high=10.0, log=False, low=0.0, step=None)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)reg_lambdaFloatDistribution(high=100.0, log=True, low=0.001, step=None)

                                                                                                                                                                                                              Parametersn_estimatorsIntDistribution(high=500, log=False, low=20, step=10)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_child_samplesIntDistribution(high=30, log=False, low=1, step=1)bootstrap_typeCategoricalDistribution(choices=('Bayesian', 'Bernoulli'))bagging_temperatureFloatDistribution(high=10.0, log=False, low=0.0, step=None)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)reg_lambdaFloatDistribution(high=100.0, log=True, low=0.001, step=None)

                                                                                                                                                                                                              "}, {"location": "API/models/catb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/catb/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                              Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                              Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                              Tip

                                                                                                                                                                                                              Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                              mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                              The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                              "}, {"location": "API/models/catb/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                              Attributesname: strName of the model.

                                                                                                                                                                                                              Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                              This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                              This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                              This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                              • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                              • estimator: Estimator used in this trial.
                                                                                                                                                                                                              • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                              • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                              • time_trial: Duration of the trial.
                                                                                                                                                                                                              • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                              • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training.

                                                                                                                                                                                                                Only the scores of the main metric are tracked. Included keys are: train and test. This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                All durations are in seconds. Possible values include:

                                                                                                                                                                                                                • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                  The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                  "}, {"location": "API/models/catb/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                  The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                  bootstrapping: Apply a bootstrap algorithm. calibrate: Calibrate the model. canvas: Create a figure with multiple plots. clear: Reset attributes and clear cache from the model. create_app: Create an interactive app to test model predictions. create_dashboard: Create an interactive dashboard to analyze the model. cross_validate: Evaluate the model using cross-validation. decision_function: Get confidence scores on new data or existing rows. evaluate: Get the model's scores for the provided metrics. export_pipeline: Export the transformer pipeline with final estimator. fit: Fit and validate the model. full_train: Train the estimator on the complete dataset. get_best_threshold: Get the threshold that maximizes the ROC curve. hyperparameter_tuning: Run the hyperparameter tuning algorithm. inverse_transform: Inversely transform new data through the pipeline. predict: Get predictions on new data or existing rows. predict_log_proba: Get class log-probabilities on new data or existing rows. predict_proba: Get class probabilities on new data or existing rows. register: Register the model in mlflow's model registry. reset_aesthetics: Reset the plot aesthetics to their default values. save_estimator: Save the estimator to a pickle file. score: Get a metric score on new data. serve: Serve the model as a REST API endpoint for inference. transform: Transform new data through the pipeline. update_layout: Update the properties of the plot's layout. update_traces: Update the properties of the plot's traces.

                                                                                                                                                                                                                  method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                  Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                  Parameters n_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                                                                                                                  reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                  method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                  Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                                                  • Cached predictions.
                                                                                                                                                                                                                  • Shap values
                                                                                                                                                                                                                  • App instance
                                                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                                                  method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                  Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                  method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                  ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                  By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                  Note

                                                                                                                                                                                                                  Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                  filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                  **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                  method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                  This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                  Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                  method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                  Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                  Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                  Returnspd.Series Scores of the model.

                                                                                                                                                                                                                  method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                  The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                  method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                  The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                  ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                  y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                  method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                  In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                  Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                  Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                  method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                  Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                  Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                  method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                  Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                  Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                  reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                  method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                  Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                  series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                  method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                  Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                  method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                  Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                  method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                  Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                  method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                  This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                  Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                  stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                  archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                  method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                  method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                  Info

                                                                                                                                                                                                                  If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                  • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                  metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                  Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                  method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                  The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                  Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                  Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                  host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                  port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                  method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                  "}, {"location": "API/models/catnb/", "title": "CategoricalNB", "text": "

                                                                                                                                                                                                                  CatNB accept sparse supports acceleration

                                                                                                                                                                                                                  Categorical Naive Bayes implements the Naive Bayes algorithm for categorical features.

                                                                                                                                                                                                                  Corresponding estimators are:

                                                                                                                                                                                                                  • CategoricalNB for classification tasks.

                                                                                                                                                                                                                  Read more in sklearn's documentation.

                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                  BernoulliNB Bernoulli Naive Bayes.

                                                                                                                                                                                                                  ComplementNB Complement Naive Bayes.

                                                                                                                                                                                                                  GaussianNB Gaussian Naive Bayes.

                                                                                                                                                                                                                  "}, {"location": "API/models/catnb/#example", "title": "Example", "text": "
                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> import numpy as np\n\n>>> X = np.random.randint(5, size=(100, 100))\n>>> y = np.random.randint(2, size=100)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"CatNB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: CatNB\nMetric: f1\n\n\nResults for CategoricalNB:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.4444\nTime elapsed: 0.029s\n-------------------------------------------------\nTime: 0.029s\n\n\nFinal results ==================== >>\nTotal time: 0.032s\n-------------------------------------\nCategoricalNB --> f1: 0.4444 ~\n
                                                                                                                                                                                                                  "}, {"location": "API/models/catnb/#hyperparameters", "title": "Hyperparameters", "text": "sklearncuml

                                                                                                                                                                                                                  ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                                                  ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                                                  "}, {"location": "API/models/catnb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/catnb/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                  Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                  Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                  Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                  mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                  The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                  "}, {"location": "API/models/catnb/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                  Attributesname: strName of the model.

                                                                                                                                                                                                                  Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                  This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                  This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                  This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                  • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                  • estimator: Estimator used in this trial.
                                                                                                                                                                                                                  • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                  • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                  • time_trial: Duration of the trial.
                                                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                  • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                    For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                    This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                    All durations are in seconds. Possible values include:

                                                                                                                                                                                                                    • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                    • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                    • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                    • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                    • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                    • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                    • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                    • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                      The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                      "}, {"location": "API/models/catnb/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                      The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                      bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                      method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                      Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                      Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                                                                                                                      reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                      method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                      Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                      method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                      This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                      Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                      cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                      horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                      vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                      title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                      • If None, no title is shown.
                                                                                                                                                                                                                      • If str, text for the title.
                                                                                                                                                                                                                      • If dict, title configuration.

                                                                                                                                                                                                                      legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                      • If None: No legend is shown.
                                                                                                                                                                                                                      • If str: Location where to show the legend.
                                                                                                                                                                                                                      • If dict: Legend configuration.

                                                                                                                                                                                                                      figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                      filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                      display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                      Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                      method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                      Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                      • In-training validation scores
                                                                                                                                                                                                                      • Cached predictions.
                                                                                                                                                                                                                      • Shap values
                                                                                                                                                                                                                      • App instance
                                                                                                                                                                                                                      • Dashboard instance
                                                                                                                                                                                                                      • Calculated holdout data sets

                                                                                                                                                                                                                      method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                      Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                      method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                      ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                      By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                      Note

                                                                                                                                                                                                                      Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                      filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                      **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                      method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                      This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                      Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                      method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                      Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                      method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                      Tip

                                                                                                                                                                                                                      Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                      Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                      rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                      threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                      • The task is binary or multilabel classification.
                                                                                                                                                                                                                      • The model has a predict_proba method.
                                                                                                                                                                                                                      • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                      For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                      Returnspd.Series Scores of the model.

                                                                                                                                                                                                                      method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                      The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                      ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                      method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                      The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                      ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                      y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                      method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                      In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                      Warning

                                                                                                                                                                                                                      Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                      Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                      method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                      Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                      Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                      method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                      Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                      Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                      reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                      method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                      Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                      ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                      • If None: y is ignored.
                                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                      Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                      series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                      method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                      Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                      method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                      Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                      method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                      Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                      method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                      This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                      Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                      stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                      archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                      method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                      method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                      Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                      method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                      Info

                                                                                                                                                                                                                      If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                      • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                      metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                      Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                      method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                      The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                      Tip

                                                                                                                                                                                                                      Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                      Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                      host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                      port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                      method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                      Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                      ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                      • If None: y is ignored.
                                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                      Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                      series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                      method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                      Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                      method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                      Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                      "}, {"location": "API/models/cnb/", "title": "ComplementNB", "text": "

                                                                                                                                                                                                                      CNB accept sparse supports acceleration

                                                                                                                                                                                                                      The Complement Naive Bayes classifier was designed to correct the \"severe assumptions\" made by the standard MultinomialNB classifier. It is particularly suited for imbalanced datasets.

                                                                                                                                                                                                                      Corresponding estimators are:

                                                                                                                                                                                                                      • ComplementNB for classification tasks.

                                                                                                                                                                                                                      Read more in sklearn's documentation.

                                                                                                                                                                                                                      See Also

                                                                                                                                                                                                                      BernoulliNB Bernoulli Naive Bayes.

                                                                                                                                                                                                                      CategoricalNB Categorical Naive Bayes.

                                                                                                                                                                                                                      MultinomialNB Multinomial Naive Bayes.

                                                                                                                                                                                                                      "}, {"location": "API/models/cnb/#example", "title": "Example", "text": "
                                                                                                                                                                                                                      >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"CNB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: CNB\nMetric: f1\n\n\nResults for ComplementNB:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9221\nTest evaluation --> f1: 0.9128\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nFinal results ==================== >>\nTotal time: 0.023s\n-------------------------------------\nComplementNB --> f1: 0.9128\n
                                                                                                                                                                                                                      "}, {"location": "API/models/cnb/#hyperparameters", "title": "Hyperparameters", "text": "sklearncuml

                                                                                                                                                                                                                      ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))normCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                                                      ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))normCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                                                      "}, {"location": "API/models/cnb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/cnb/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                      Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                      Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                      Tip

                                                                                                                                                                                                                      Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                      mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                      The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                      "}, {"location": "API/models/cnb/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                      Attributesname: strName of the model.

                                                                                                                                                                                                                      Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                      This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                      This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                      This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                      • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                      • estimator: Estimator used in this trial.
                                                                                                                                                                                                                      • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                      • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                      • time_trial: Duration of the trial.
                                                                                                                                                                                                                      • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                      • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                        For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                        This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                        The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                        All durations are in seconds. Possible values include:

                                                                                                                                                                                                                        • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                        • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                        • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                        • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                        • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                        • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                        • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                        • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                          The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                          "}, {"location": "API/models/cnb/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                          The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                          bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                          method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                          Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                  Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                                                                                                                          reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                          method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                          Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                          This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                          Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                          cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                          horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                          vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                          title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                          • If None, no title is shown.
                                                                                                                                                                                                                          • If str, text for the title.
                                                                                                                                                                                                                          • If dict, title configuration.

                                                                                                                                                                                                                          legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                          • If None: No legend is shown.
                                                                                                                                                                                                                          • If str: Location where to show the legend.
                                                                                                                                                                                                                          • If dict: Legend configuration.

                                                                                                                                                                                                                          figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                          filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                          display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                          Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                          method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                          Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                          • In-training validation scores
                                                                                                                                                                                                                          • Cached predictions.
                                                                                                                                                                                                                          • Shap values
                                                                                                                                                                                                                          • App instance
                                                                                                                                                                                                                          • Dashboard instance
                                                                                                                                                                                                                          • Calculated holdout data sets

                                                                                                                                                                                                                          method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                          Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                          method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                          ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                          By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                          Note

                                                                                                                                                                                                                          Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                          filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                          **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                          method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                          This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                          Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                          method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                          Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                          method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                          Tip

                                                                                                                                                                                                                          Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                          Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                          rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                          threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                          • The task is binary or multilabel classification.
                                                                                                                                                                                                                          • The model has a predict_proba method.
                                                                                                                                                                                                                          • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                          For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                          Returnspd.Series Scores of the model.

                                                                                                                                                                                                                          method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                          The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                          ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                          method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                          The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                          ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                          y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                          method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                          In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                          Warning

                                                                                                                                                                                                                          Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                          Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                          method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                          Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                          Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                          method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                          Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                          Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                          reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                          method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                          Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                          ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                          • If None: y is ignored.
                                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                          Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                          series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                          method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                          Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                          method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                          Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                          method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                          Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                          method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                          This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                          Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                          stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                          archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                          method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                          method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                          Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                          method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                          Info

                                                                                                                                                                                                                          If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                          • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                          metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                          Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                          method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                          The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                          Tip

                                                                                                                                                                                                                          Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                          Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                          host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                          port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                          method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                          Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                          ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                          • If None: y is ignored.
                                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                          Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                          series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                          method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                          Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                          method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                          Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                          "}, {"location": "API/models/dummy/", "title": "Dummy", "text": "

                                                                                                                                                                                                                          Dummy

                                                                                                                                                                                                                          When doing supervised learning, a simple sanity check consists of comparing one's estimator against simple rules of thumb. The prediction methods completely ignore the input data. Do not use this model for real problems. Use it only as a simple baseline to compare with other models.

                                                                                                                                                                                                                          Corresponding estimators are:

                                                                                                                                                                                                                          • DummyClassifier for classification tasks.
                                                                                                                                                                                                                          • DummyRegressor for regression tasks.

                                                                                                                                                                                                                          Read more in sklearn's documentation.

                                                                                                                                                                                                                          See Also

                                                                                                                                                                                                                          DecisionTree Single Decision Tree.

                                                                                                                                                                                                                          ExtraTree Extremely Randomized Tree.

                                                                                                                                                                                                                          NaiveForecaster Naive Forecaster.

                                                                                                                                                                                                                          "}, {"location": "API/models/dummy/#example", "title": "Example", "text": "
                                                                                                                                                                                                                          >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"Dummy\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: Dummy\nMetric: f1\n\n\nResults for Dummy:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.7709\nTest evaluation --> f1: 0.7717\nTime elapsed: 0.018s\n-------------------------------------------------\nTime: 0.018s\n\n\nFinal results ==================== >>\nTotal time: 0.021s\n-------------------------------------\nDummy --> f1: 0.7717\n
                                                                                                                                                                                                                          "}, {"location": "API/models/dummy/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                                                                                                                                                                                                                          ParametersstrategyCategoricalDistribution(choices=('most_frequent', 'prior', 'stratified', 'uniform'))

                                                                                                                                                                                                                          ParametersstrategyCategoricalDistribution(choices=('mean', 'median', 'quantile'))quantileFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)

                                                                                                                                                                                                                          "}, {"location": "API/models/dummy/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/dummy/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                          Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                          Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                          Tip

                                                                                                                                                                                                                          Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                          mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                          The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                          "}, {"location": "API/models/dummy/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                          Attributesname: strName of the model.

                                                                                                                                                                                                                          Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                          This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                          This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                          This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                          • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                          • estimator: Estimator used in this trial.
                                                                                                                                                                                                                          • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                          • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                          • time_trial: Duration of the trial.
                                                                                                                                                                                                                          • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                          • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                            For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                            This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                            The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                            All durations are in seconds. Possible values include:

                                                                                                                                                                                                                            • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                            • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                            • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                            • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                            • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                            • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                            • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                            • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                              The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                              "}, {"location": "API/models/dummy/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                              The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                              bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                              method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                              Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                              Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                                                                                                                              reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                              method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                              Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                              method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                              This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                              Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                              cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                              horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                              vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                              title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                              • If None, no title is shown.
                                                                                                                                                                                                                              • If str, text for the title.
                                                                                                                                                                                                                              • If dict, title configuration.

                                                                                                                                                                                                                              legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                              • If None: No legend is shown.
                                                                                                                                                                                                                              • If str: Location where to show the legend.
                                                                                                                                                                                                                              • If dict: Legend configuration.

                                                                                                                                                                                                                              figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                              filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                              display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                              Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                              method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                              Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                              • In-training validation scores
                                                                                                                                                                                                                              • Cached predictions
                                                                                                                                                                                                                              • Shap values
                                                                                                                                                                                                                              • App instance
                                                                                                                                                                                                                              • Dashboard instance
                                                                                                                                                                                                                              • Calculated holdout data sets

                                                                                                                                                                                                                              method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                              Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                              method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                              ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                              By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                              Note

                                                                                                                                                                                                                              Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                              filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                              **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                              method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                              This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                              Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                              method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                              Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                              method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                              Tip

                                                                                                                                                                                                                              Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                              Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                              rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                              threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                              • The task is binary or multilabel classification.
                                                                                                                                                                                                                              • The model has a predict_proba method.
                                                                                                                                                                                                                              • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                              For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                              Returnspd.Series Scores of the model.

                                                                                                                                                                                                                              method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                              The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                              ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                              method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                              The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                              ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                              y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                              method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                              In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                              Warning

                                                                                                                                                                                                                              Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                              Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                              method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                              Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                              Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                              method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                              Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                              Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                              reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                              method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                              Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                              ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                              • If None: y is ignored.
                                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                              Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                              series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                              method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                              Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                              method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                              Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                              method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                              Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                              method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                              This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                              Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                              stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                              archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                              method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                              method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                              Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                              method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                              Info

                                                                                                                                                                                                                              If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                              • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                              metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                              Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                              method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                              The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                              Tip

                                                                                                                                                                                                                              Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                              Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                              host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                              port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                              method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                              Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                              ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                              • If None: y is ignored.
                                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                              Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                              series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                              method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                              Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                              method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                              Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                              "}, {"location": "API/models/en/", "title": "ElasticNet", "text": "

                                                                                                                                                                                                                              EN needs scaling accept sparse supports acceleration

                                                                                                                                                                                                                              Linear least squares with l1 and l2 regularization.

                                                                                                                                                                                                                              Corresponding estimators are:

                                                                                                                                                                                                                              • ElasticNet for regression tasks.

                                                                                                                                                                                                                              Read more in sklearn's documentation.

                                                                                                                                                                                                                              See Also

                                                                                                                                                                                                                              Lasso Linear Regression with lasso regularization.

                                                                                                                                                                                                                              OrdinaryLeastSquares Linear Regression.

                                                                                                                                                                                                                              Ridge Linear least squares with l2 regularization.

                                                                                                                                                                                                                              "}, {"location": "API/models/en/#example", "title": "Example", "text": "
                                                                                                                                                                                                                              >>> from atom import ATOMRegressor\n>>> from sklearn.datasets import fetch_california_housing\n\n>>> X, y = fetch_california_housing(return_X_y=True)\n\n>>> atom = ATOMRegressor(X, y, random_state=1)\n>>> atom.run(models=\"EN\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= >>\nModels: EN\nMetric: r2\n\n\nResults for ElasticNet:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.2061\nTest evaluation --> r2: 0.2016\nTime elapsed: 0.137s\n-------------------------------------------------\nTime: 0.137s\n\n\nFinal results ==================== >>\nTotal time: 0.139s\n-------------------------------------\nElasticNet --> r2: 0.2016\n
                                                                                                                                                                                                                              "}, {"location": "API/models/en/#hyperparameters", "title": "Hyperparameters", "text": "sklearnsklearnexcuml

                                                                                                                                                                                                                              ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)selectionCategoricalDistribution(choices=('cyclic', 'random'))

                                                                                                                                                                                                                              cpugpu

                                                                                                                                                                                                                              ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)selectionCategoricalDistribution(choices=('cyclic', 'random'))

                                                                                                                                                                                                                              ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)selectionCategoricalDistribution(choices=('cyclic', 'random'))

                                                                                                                                                                                                                              ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)selectionCategoricalDistribution(choices=('cyclic', 'random'))

                                                                                                                                                                                                                              "}, {"location": "API/models/en/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/en/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                              Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                              Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                              Tip

                                                                                                                                                                                                                              Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                              mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                              The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                              "}, {"location": "API/models/en/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                              Attributesname: strName of the model.

                                                                                                                                                                                                                              Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                              This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                              This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                              This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                              • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                              • estimator: Estimator used in this trial.
                                                                                                                                                                                                                              • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                              • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                              • time_trial: Duration of the trial.
                                                                                                                                                                                                                              • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                              • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                  The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                  "}, {"location": "API/models/en/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                  The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                  bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                  method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                  Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                  Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                                                                                                                                  reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                  method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                  Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                                                                  • Cached predictions.
                                                                                                                                                                                                                                  • Shap values
                                                                                                                                                                                                                                  • App instance
                                                                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                                                                  method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                  Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                  method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                  ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                  By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                  Note

                                                                                                                                                                                                                                  Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                  filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                  method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                  This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                  Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                  method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                  Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                  Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                  Returns: pd.Series Scores of the model.

                                                                                                                                                                                                                                  method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                  The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                  Returns: Pipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                  method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                  The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                  ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                  y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                  method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                  In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                  Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                  Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                  method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                  Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                  Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                  method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                  Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                  Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                  reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                  method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                  Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                  series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                  method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                  Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                  method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                  Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                  method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                  Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                  method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                  This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                  Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                  stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                  archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                  method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                  method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                  Info

                                                                                                                                                                                                                                  If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                  • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                  metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                  Returns: float Metric score of X with respect to y.

                                                                                                                                                                                                                                  method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                  The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                  Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                  Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                  host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                  port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                  method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                  "}, {"location": "API/models/es/", "title": "ExponentialSmoothing", "text": "

                                                                                                                                                                                                                                  ES native multioutput

                                                                                                                                                                                                                                  Holt-Winters exponential smoothing forecaster. The default settings use simple exponential smoothing, without trend and seasonality components.

                                                                                                                                                                                                                                  Corresponding estimators are:

                                                                                                                                                                                                                                  • ExponentialSmoothing for forecasting tasks.

                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                  ARIMA Autoregressive Integrated Moving Average Model.

                                                                                                                                                                                                                                  ETS ETS model with automatic fitting capabilities.

                                                                                                                                                                                                                                  PolynomialTrend Polynomial Trend forecaster.

                                                                                                                                                                                                                                  "}, {"location": "API/models/es/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                  >>> from atom import ATOMForecaster\n>>> from sktime.datasets import load_airline\n\n>>> y = load_airline()\n\n>>> atom = ATOMForecaster(y, random_state=1)\n>>> atom.run(models=\"ES\", verbose=2)\n\n\nTraining ========================= >>\nModels: ES\nMetric: mape\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0864\nTest evaluation --> mape: -0.2303\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nFinal results ==================== >>\nTotal time: 0.020s\n-------------------------------------\nExponentialSmoothing --> mape: -0.2303\n
                                                                                                                                                                                                                                  "}, {"location": "API/models/es/#hyperparameters", "title": "Hyperparameters", "text": "

                                                                                                                                                                                                                                  ParameterstrendCategoricalDistribution(choices=('add', 'mul', None))damped_trendCategoricalDistribution(choices=(True, False))seasonalCategoricalDistribution(choices=('add', 'mul', None))spCategoricalDistribution(choices=(4, 6, 7, 12, None))use_boxcoxCategoricalDistribution(choices=(True, False))initialization_methodCategoricalDistribution(choices=('estimated', 'heuristic'))methodCategoricalDistribution(choices=('L-BFGS-B', 'TNC', 'SLSQP', 'Powell', 'trust-constr', 'bh', 'ls'))

                                                                                                                                                                                                                                  "}, {"location": "API/models/es/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/es/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                  Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                  Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                  Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                  mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                  The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                  "}, {"location": "API/models/es/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                  Attributesname: strName of the model.

                                                                                                                                                                                                                                  Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                  This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                  This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                  This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                  • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                  • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                  • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                  • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                  • time_trial: Duration of the trial.
                                                                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                  • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                    For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                    This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                    All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                    • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                    • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                    • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                    • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                    • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                    • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                    • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                    • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                      The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                      "}, {"location": "API/models/es/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                      The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                      bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                      method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                      Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                      Parametersn_bootstrap: int Number of bootstrapped samples to fit on.

                                                                                                                                                                                                                                      reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                      method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                      Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                      method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                      This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                      Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                      cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                      horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                      vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                      title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                      • If None, no title is shown.
                                                                                                                                                                                                                                      • If str, text for the title.
                                                                                                                                                                                                                                      • If dict, title configuration.

                                                                                                                                                                                                                                      legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                      • If None: No legend is shown.
                                                                                                                                                                                                                                      • If str: Location where to show the legend.
                                                                                                                                                                                                                                      • If dict: Legend configuration.

                                                                                                                                                                                                                                      figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                      filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                      display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                      Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                      method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                      Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                      • In-training validation scores
                                                                                                                                                                                                                                      • Cached predictions.
                                                                                                                                                                                                                                      • Shap values
                                                                                                                                                                                                                                      • App instance
                                                                                                                                                                                                                                      • Dashboard instance
                                                                                                                                                                                                                                      • Calculated holdout data sets

                                                                                                                                                                                                                                      method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                      Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                      method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                      ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                      By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                      Note

                                                                                                                                                                                                                                      Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                      filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                      **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                      method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                      This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                      Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                      method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                      Tip

                                                                                                                                                                                                                                      Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                      Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                      rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                      threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                      • The task is binary or multilabel classification.
                                                                                                                                                                                                                                      • The model has a predict_proba method.
                                                                                                                                                                                                                                      • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                      For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                      Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                      method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                      The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                      ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                      method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                      The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                      ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                      y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                      method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                      In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                      Warning

                                                                                                                                                                                                                                      Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                      Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                      method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                      Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                      Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                      method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                      Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                      Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                      reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                      method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                      Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                      ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                      • If None: y is ignored.
                                                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                      Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                      series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                      method predict(fh, X=None, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                      Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                                                      X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                      Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                                                                                                                                                                                                      method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]Get prediction intervals on new data or existing rows.

                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_interval method.

                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                      Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                                                      X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                                                      coverage: float or sequence, default=0.9 Nominal coverage(s) of predictive interval(s).

                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                      Returnsdataframe Predictions with shape=(n_samples, 2) or shape=(n_samples, 2 * n_targets) for multivariate tasks.

                                                                                                                                                                                                                                      method predict_proba(fh, X=None, marginal=True, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                      Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                                                      X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                                                      marginal: bool, default=True Whether returned distribution is marginal by time index.

                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                      Returnssktime.proba.Normal Predicted distribution.

                                                                                                                                                                                                                                      method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_quantiles method.

                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                      Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                                                      X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                                                      alpha: float or list of float, default=[0.05, 0.95] A probability or list of, at which quantile forecasts are computed.

                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                      Returnsdataframe Predictions with shape=(n_samples, len(alpha)) or shape=(n_samples, len(alpha) * n_targets) for multivariate tasks.

                                                                                                                                                                                                                                      method predict_residuals(y, X=None, verbose=None)[source]Get residuals of forecasts on new data or existing rows.

                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_residuals method.

                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                      Parametersy: int, str, dict, sequence or dataframe Ground truth observations.

                                                                                                                                                                                                                                      X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to y.

                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                      Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                                                                                                                                                                                                      method predict_var(fh, X=None, cov=False, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_var method.

                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                      Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                                                      X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                                                      cov: bool, default=False Whether to compute covariance matrix forecast or marginal variance forecasts.

                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                      Returnsdataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                                                                                                                                                                                                      method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                      This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                      Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                      stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                      archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                      method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                      method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                      Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                      method score(y, X=None, fh=None, metric=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                      Info

                                                                                                                                                                                                                                      If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sktime's score method for estimators.

                                                                                                                                                                                                                                      Parametersy: int, str, dict, sequence or dataframe Ground truth observations.

                                                                                                                                                                                                                                      X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                                                      fh: int, sequence or ForecastingHorizon or None, default=None The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                                                      metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                      Returnsfloat Metric score of y with respect to a ground truth.

                                                                                                                                                                                                                                      method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                      The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                      Tip

                                                                                                                                                                                                                                      Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                      Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                      host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                      port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                      method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                      Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                      ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                      • If None: y is ignored.
                                                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                      Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                      series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                      method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                      Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                      method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                      Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                      "}, {"location": "API/models/et/", "title": "ExtraTrees", "text": "

                                                                                                                                                                                                                                      ET accept sparse native multilabel native multioutput

                                                                                                                                                                                                                                      Extra-Trees use a meta estimator that fits a number of randomized decision trees (a.k.a. extra-trees) on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.

                                                                                                                                                                                                                                      Corresponding estimators are:

                                                                                                                                                                                                                                      • ExtraTreesClassifier for classification tasks.
                                                                                                                                                                                                                                      • ExtraTreesRegressor for regression tasks.

                                                                                                                                                                                                                                      Read more in sklearn's documentation.

                                                                                                                                                                                                                                      See Also

                                                                                                                                                                                                                                      DecisionTree Single Decision Tree.

                                                                                                                                                                                                                                      ExtraTree Extremely Randomized Tree.

                                                                                                                                                                                                                                      RandomForest Random Forest.

                                                                                                                                                                                                                                      "}, {"location": "API/models/et/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                      >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"ET\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: ET\nMetric: f1\n\n\nResults for ExtraTrees:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9655\nTime elapsed: 0.110s\n-------------------------------------------------\nTime: 0.110s\n\n\nFinal results ==================== >>\nTotal time: 0.112s\n-------------------------------------\nExtraTrees --> f1: 0.9655\n
                                                                                                                                                                                                                                      "}, {"location": "API/models/et/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                                                                                                                                                                                                                                      Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('gini', 'entropy'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                                                                                                                                                                                                      Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('squared_error', 'absolute_error'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                                                                                                                                                                                                      "}, {"location": "API/models/et/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/et/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                      Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                      Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                      Tip

                                                                                                                                                                                                                                      Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                      mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                      The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                      "}, {"location": "API/models/et/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                      Attributesname: strName of the model.

                                                                                                                                                                                                                                      Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                      This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                      This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                      This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                      • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                      • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                      • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                      • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                      • time_trial: Duration of the trial.
                                                                                                                                                                                                                                      • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                      • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                        For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                        This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                        The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                        All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                        • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                        • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                        • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                        • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                        • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                        • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                        • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                        • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                          The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                          "}, {"location": "API/models/et/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                          The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                          bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                          method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                          Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                          Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                          reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                          method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                          Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                          This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                          Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                          cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                          horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                          vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                          title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                          • If None, no title is shown.
                                                                                                                                                                                                                                          • If str, text for the title.
                                                                                                                                                                                                                                          • If dict, title configuration.

                                                                                                                                                                                                                                          legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                          • If None: No legend is shown.
                                                                                                                                                                                                                                          • If str: Location where to show the legend.
                                                                                                                                                                                                                                          • If dict: Legend configuration.

                                                                                                                                                                                                                                          figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                          filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                          display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                          Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                          method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                          Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                          • In-training validation scores
                                                                                                                                                                                                                                          • Cached predictions.
                                                                                                                                                                                                                                          • Shap values
                                                                                                                                                                                                                                          • App instance
                                                                                                                                                                                                                                          • Dashboard instance
                                                                                                                                                                                                                                          • Calculated holdout data sets

                                                                                                                                                                                                                                          method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                          Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                          method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                          ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                          By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                          Note

                                                                                                                                                                                                                                          Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                          filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                          **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                          method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                          This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                          Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                          method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                          Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                          method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                          Tip

                                                                                                                                                                                                                                          Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                          Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                          rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                          threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                          • The task is binary or multilabel classification.
                                                                                                                                                                                                                                          • The model has a predict_proba method.
                                                                                                                                                                                                                                          • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                          For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                          Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                          method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                          The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                          ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                          method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                          The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                          ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                          y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                          method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                          In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                          Warning

                                                                                                                                                                                                                                          Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                          Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                          method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                          Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                          Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                          method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                          Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                          Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                          reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                          method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                          Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                          ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                          • If None: y is ignored.
                                                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                          Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                          series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                          method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                          Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                          method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                          Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                          method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                          Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                          method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                          This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                          Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                          stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                          archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                          method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                          method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                          Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                          method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                          Info

                                                                                                                                                                                                                                          If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                          • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                          metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                          Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                          method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                          The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                          Tip

                                                                                                                                                                                                                                          Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                          Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                          host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                          port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                          method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                          Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                          ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                          • If None: y is ignored.
                                                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                          Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                          series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                          method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                          Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                          method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                          Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                          "}, {"location": "API/models/etree/", "title": "ExtraTree", "text": "

                                                                                                                                                                                                                                          ETree accept sparse native multilabel native multioutput

                                                                                                                                                                                                                                          Extra-trees differ from classic decision trees in the way they are built. When looking for the best split to separate the samples of a node into two groups, random splits are drawn for each of the max_features randomly selected features and the best split among those is chosen. When max_features is set to 1, this amounts to building a totally random decision tree.

                                                                                                                                                                                                                                          Corresponding estimators are:

                                                                                                                                                                                                                                          • ExtraTreeClassifier for classification tasks.
                                                                                                                                                                                                                                          • ExtraTreeRegressor for regression tasks.

                                                                                                                                                                                                                                          Read more in sklearn's documentation.

                                                                                                                                                                                                                                          See Also

                                                                                                                                                                                                                                          DecisionTree Single Decision Tree.

                                                                                                                                                                                                                                          ExtraTrees Extremely Randomized Trees.

                                                                                                                                                                                                                                          RandomForest Random Forest.

                                                                                                                                                                                                                                          "}, {"location": "API/models/etree/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                          >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"ETree\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: ETree\nMetric: f1\n\n\nResults for ExtraTree:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9241\nTime elapsed: 0.021s\n-------------------------------------------------\nTime: 0.021s\n\n\nFinal results ==================== >>\nTotal time: 0.024s\n-------------------------------------\nExtraTree --> f1: 0.9241\n
                                                                                                                                                                                                                                          "}, {"location": "API/models/etree/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                                                                                                                                                                                                                                          ParameterscriterionCategoricalDistribution(choices=('gini', 'entropy'))splitterCategoricalDistribution(choices=('random', 'best'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                                                                                                                                                                                                          ParameterscriterionCategoricalDistribution(choices=('squared_error', 'absolute_error'))splitterCategoricalDistribution(choices=('random', 'best'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                                                                                                                                                                                                          "}, {"location": "API/models/etree/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/etree/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                          Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                          Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                          Tip

                                                                                                                                                                                                                                          Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                          mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                          The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                          "}, {"location": "API/models/etree/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                          Attributesname: strName of the model.

                                                                                                                                                                                                                                          Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                          This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                          This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                          This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                          • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                          • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                          • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                          • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                          • time_trial: Duration of the trial.
                                                                                                                                                                                                                                          • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                          • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                            For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                            This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                            The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                            All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                            • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                            • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                            • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                            • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                            • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                            • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                            • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                            • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                              The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                              "}, {"location": "API/models/etree/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                              The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                              bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                              method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                              Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                              Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                              reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                              method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                              Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                              method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                              This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                              Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                              cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                              horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                              vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                              title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                              • If None, no title is shown.
                                                                                                                                                                                                                                              • If str, text for the title.
                                                                                                                                                                                                                                              • If dict, title configuration.

                                                                                                                                                                                                                                              legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                              • If None: No legend is shown.
                                                                                                                                                                                                                                              • If str: Location where to show the legend.
                                                                                                                                                                                                                                              • If dict: Legend configuration.

                                                                                                                                                                                                                                              figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                              filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                              display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                              Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                              method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                              Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                              • In-training validation scores
                                                                                                                                                                                                                                              • Cached predictions.
                                                                                                                                                                                                                                              • Shap values
                                                                                                                                                                                                                                              • App instance
                                                                                                                                                                                                                                              • Dashboard instance
                                                                                                                                                                                                                                              • Calculated holdout data sets

                                                                                                                                                                                                                                              method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                              Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                              method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                              ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                              By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                              Note

                                                                                                                                                                                                                                              Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                              filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                              **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                              method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                              This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                              Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                              method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                              Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                              method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                              Tip

                                                                                                                                                                                                                                              Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                              Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                              rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                              threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                              • The task is binary or multilabel classification.
                                                                                                                                                                                                                                              • The model has a predict_proba method.
                                                                                                                                                                                                                                              • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                              For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                              Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                              method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                              The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                              ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                              method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                              The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                              ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                              y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                              method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                              In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                              Warning

                                                                                                                                                                                                                                              Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                              Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                              method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                              Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                              Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                              method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                              Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                              Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                              reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                              method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                              Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                              ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                              • If None: y is ignored.
                                                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                              Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                              series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                              method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                              Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                              method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                              Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                              method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                              Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                              method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                              This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                              Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                              stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                              archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                              method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                              method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                              Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                              method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                              Info

                                                                                                                                                                                                                                              If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                              • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                              metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                              Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                              method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                              The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                              Tip

                                                                                                                                                                                                                                              Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                              Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                              host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                              port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                              method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                              Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                              ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                              • If None: y is ignored.
                                                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                              Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                              series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                              method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                              Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                              method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                              Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                              "}, {"location": "API/models/ets/", "title": "ETS", "text": "

                                                                                                                                                                                                                                              ETS native multioutput

                                                                                                                                                                                                                                              The ETS models are a family of time series models with an underlying state space model consisting of a level component, a trend component (T), a seasonal component (S), and an error term (E).

                                                                                                                                                                                                                                              Corresponding estimators are:

                                                                                                                                                                                                                                              • AutoETS for forecasting tasks.

                                                                                                                                                                                                                                              See Also

                                                                                                                                                                                                                                              ARIMA Autoregressive Integrated Moving Average Model.

                                                                                                                                                                                                                                              ExponentialSmoothing Exponential Smoothing forecaster.

                                                                                                                                                                                                                                              PolynomialTrend Polynomial Trend forecaster.

                                                                                                                                                                                                                                              "}, {"location": "API/models/ets/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                              >>> from atom import ATOMForecaster\n>>> from sktime.datasets import load_airline\n\n>>> y = load_airline()\n\n>>> atom = ATOMForecaster(y, random_state=1)\n>>> atom.run(models=\"ETS\", verbose=2)\n\n\nTraining ========================= >>\nModels: ETS\nMetric: mape\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0858\nTest evaluation --> mape: -0.2305\nTime elapsed: 0.021s\n-------------------------------------------------\nTime: 0.021s\n\n\nFinal results ==================== >>\nTotal time: 0.022s\n-------------------------------------\nETS --> mape: -0.2305\n
                                                                                                                                                                                                                                              "}, {"location": "API/models/ets/#hyperparameters", "title": "Hyperparameters", "text": "

                                                                                                                                                                                                                                              ParameterserrorCategoricalDistribution(choices=('add', 'mul'))trendCategoricalDistribution(choices=('add', 'mul', None))damped_trendCategoricalDistribution(choices=(True, False))seasonalCategoricalDistribution(choices=('add', 'mul', None))spCategoricalDistribution(choices=(1, 4, 6, 7, 12))initialization_methodCategoricalDistribution(choices=('estimated', 'heuristic'))maxiterIntDistribution(high=2000, log=False, low=500, step=100)autoCategoricalDistribution(choices=(True, False))information_criterionCategoricalDistribution(choices=('aic', 'bic', 'aicc'))

                                                                                                                                                                                                                                              "}, {"location": "API/models/ets/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/ets/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                              Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                              Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                              Tip

                                                                                                                                                                                                                                              Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                              mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                              The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                              "}, {"location": "API/models/ets/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                              Attributesname: strName of the model.

                                                                                                                                                                                                                                              Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                              This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                              This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                              This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                              • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                              • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                              • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                              • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                              • time_trial: Duration of the trial.
                                                                                                                                                                                                                                              • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                              • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                  The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                  "}, {"location": "API/models/ets/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                  The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                  bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                  method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                  Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                  Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                  reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                  method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                  Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                                                                                  • Cached predictions.
                                                                                                                                                                                                                                                  • Shap values
                                                                                                                                                                                                                                                  • App instance
                                                                                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                                                                                  method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                  Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                  method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                  ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                  By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                  Note

                                                                                                                                                                                                                                                  Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                  method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                  This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                  Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                  Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                  Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                  method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                  The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                  method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                  The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                  ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                  y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                  method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                  In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                  Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                  Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                  method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                  Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                  Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                  method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                  Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                  Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                  reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                  method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                  Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                  series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                  method predict(fh, X=None, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                  Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                                                                                                                                                                                                                  method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]Get prediction intervals on new data or existing rows.

                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_interval method.

                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                                                                  coverage: float or sequence, default=0.9 Nominal coverage(s) of predictive interval(s).

                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                  Returnsdataframe Predictions with shape=(n_samples, 2) or shape=(n_samples, 2 * n_targets) for multivariate tasks.

                                                                                                                                                                                                                                                  method predict_proba(fh, X=None, marginal=True, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                                                                  marginal: bool, default=True Whether returned distribution is marginal by time index.

                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                  Returnssktime.proba.Normal Predicted distribution.

                                                                                                                                                                                                                                                  method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_quantiles method.

                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                                                                  alpha: float or list of float, default=[0.05, 0.95] A probability or list of, at which quantile forecasts are computed.

                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                  Returnsdataframe Predictions with shape=(n_samples, len(alpha)) or shape=(n_samples, len(alpha) * n_targets) for multivariate tasks.

                                                                                                                                                                                                                                                  method predict_residuals(y, X=None, verbose=None)[source]Get residuals of forecasts on new data or existing rows.

                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_residuals method.

                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                  Parametersy: int, str, dict, sequence or dataframe Ground truth observations.

                                                                                                                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to y.

                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                  Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                                                                                                                                                                                                                  method predict_var(fh, X=None, cov=False, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_var method.

                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                                                                  cov: bool, default=False Whether to compute covariance matrix forecast or marginal variance forecasts.

                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                  Returnsdataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                                                                                                                                                                                                                  method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                  This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                  Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                  stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                  archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                  method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                  method score(y, X=None, fh=None, metric=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                  Info

                                                                                                                                                                                                                                                  If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sktime's score method for estimators.

                                                                                                                                                                                                                                                  Parametersy: int, str, dict, sequence or dataframe Ground truth observations.

                                                                                                                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                                                                  fh: int, sequence or ForecastingHorizon or None, default=None The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                                                                  metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                  Returnsfloat Metric score of y with respect to a ground truth.

                                                                                                                                                                                                                                                  method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                  The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                  Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                  Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                  host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                  port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                  method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                  "}, {"location": "API/models/gbm/", "title": "GradientBoostingMachine", "text": "

                                                                                                                                                                                                                                                  GBM accept sparse

                                                                                                                                                                                                                                                  A Gradient Boosting Machine builds an additive model in a forward stage-wise fashion; it allows for the optimization of arbitrary differentiable loss functions. In each stage n_classes_ regression trees are fit on the negative gradient of the loss function, e.g. binary or multiclass log loss. Binary classification is a special case where only a single regression tree is induced.

                                                                                                                                                                                                                                                  Corresponding estimators are:

                                                                                                                                                                                                                                                  • GradientBoostingClassifier for classification tasks.
                                                                                                                                                                                                                                                  • GradientBoostingRegressor for regression tasks.

                                                                                                                                                                                                                                                  Read more in sklearn's documentation.

                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                  HistGradientBoosting is a much faster variant of this algorithm for intermediate datasets (n_samples >= 10k).

                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                  CatBoost Cat Boosting Machine.

                                                                                                                                                                                                                                                  HistGradientBoosting Histogram-based Gradient Boosting Machine.

                                                                                                                                                                                                                                                  LightGBM Light Gradient Boosting Machine.

                                                                                                                                                                                                                                                  "}, {"location": "API/models/gbm/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"GBM\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: GBM\nMetric: f1\n\n\nResults for GradientBoostingMachine:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9589\nTime elapsed: 0.886s\n-------------------------------------------------\nTime: 0.886s\n\n\nFinal results ==================== >>\nTotal time: 0.890s\n-------------------------------------\nGradientBoostingMachine --> f1: 0.9589\n
                                                                                                                                                                                                                                                  "}, {"location": "API/models/gbm/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                                                                                                                                                                                                                                                  ParameterslossCategoricalDistribution(choices=('log_loss', 'exponential'))learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)n_estimatorsIntDistribution(high=500, log=False, low=10, step=10)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)criterionCategoricalDistribution(choices=('friedman_mse', 'squared_error'))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_depthIntDistribution(high=21, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                                                                                                                                                                                                                  ParameterslossCategoricalDistribution(choices=('squared_error', 'absolute_error', 'huber', 'quantile'))learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)n_estimatorsIntDistribution(high=500, log=False, low=10, step=10)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)criterionCategoricalDistribution(choices=('friedman_mse', 'squared_error'))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_depthIntDistribution(high=21, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)alphaFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)

                                                                                                                                                                                                                                                  "}, {"location": "API/models/gbm/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/gbm/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                  Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                  Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                  Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                  mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                  The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                  "}, {"location": "API/models/gbm/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                  Attributesname: strName of the model.

                                                                                                                                                                                                                                                  Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                  This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                  This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                  This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                  • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                  • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                  • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                  • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                  • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                  • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                    For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                    This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                    All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                    • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                    • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                    • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                    • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                    • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                    • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                    • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                    • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                      The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                      "}, {"location": "API/models/gbm/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                      The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                      bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                      method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                      Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                      Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                      reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                      method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                      Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                      method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                      This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                      Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                      cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                      horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                      vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                      title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                      • If None, no title is shown.
                                                                                                                                                                                                                                                      • If str, text for the title.
                                                                                                                                                                                                                                                      • If dict, title configuration.

                                                                                                                                                                                                                                                      legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                      • If None: No legend is shown.
                                                                                                                                                                                                                                                      • If str: Location where to show the legend.
                                                                                                                                                                                                                                                      • If dict: Legend configuration.

                                                                                                                                                                                                                                                      figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                      filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                      display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                      Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                      method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                      Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                      • In-training validation scores
                                                                                                                                                                                                                                                      • Cached predictions.
                                                                                                                                                                                                                                                      • Shap values
                                                                                                                                                                                                                                                      • App instance
                                                                                                                                                                                                                                                      • Dashboard instance
                                                                                                                                                                                                                                                      • Calculated holdout data sets

                                                                                                                                                                                                                                                      method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                      Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                      method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                      ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                      By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                      Note

                                                                                                                                                                                                                                                      Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                      filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                      **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                      method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                      This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                      Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                      method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                      Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                                      method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                      Tip

                                                                                                                                                                                                                                                      Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                      Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                      rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                      threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                      • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                      • The model has a predict_proba method.
                                                                                                                                                                                                                                                      • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                      For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                      Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                      method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                      The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                      ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                      method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                      The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                      ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                      y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                      method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                      In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                      Warning

                                                                                                                                                                                                                                                      Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                      Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                      method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                      Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                      Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                      method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                      Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                      Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                      reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                      method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                      Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                      ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                      • If None: y is ignored.
                                                                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                      Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                      series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                      method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                      Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                      method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                      Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                                      method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                      Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                                      method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                      This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                      Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                      stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                      archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                      method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                      method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                      Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                      method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                      Info

                                                                                                                                                                                                                                                      If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                      • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                      metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                      Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                                      method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                      The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                      Tip

                                                                                                                                                                                                                                                      Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                      Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                      host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                      port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                      method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                      Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                      ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                      • If None: y is ignored.
                                                                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                      Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                      series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                      method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                      Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                      method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                      Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                      "}, {"location": "API/models/gnb/", "title": "GaussianNB", "text": "

                                                                                                                                                                                                                                                      GNB supports acceleration

                                                                                                                                                                                                                                                      Gaussian Naive Bayes implements the Naive Bayes algorithm for classification. The likelihood of the features is assumed to be Gaussian.

                                                                                                                                                                                                                                                      Corresponding estimators are:

                                                                                                                                                                                                                                                      • GaussianNB for classification tasks.

                                                                                                                                                                                                                                                      Read more in sklearn's documentation.

                                                                                                                                                                                                                                                      See Also

                                                                                                                                                                                                                                                      BernoulliNB Bernoulli Naive Bayes.

                                                                                                                                                                                                                                                      CategoricalNB Categorical Naive Bayes.

                                                                                                                                                                                                                                                      ComplementNB Complement Naive Bayes.

                                                                                                                                                                                                                                                      "}, {"location": "API/models/gnb/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                      >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"GNB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: GNB\nMetric: f1\n\n\nResults for GaussianNB:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9553\nTest evaluation --> f1: 0.9371\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nFinal results ==================== >>\nTotal time: 0.022s\n-------------------------------------\nGaussianNB --> f1: 0.9371\n
                                                                                                                                                                                                                                                      "}, {"location": "API/models/gnb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/gnb/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                      Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                      Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                      Tip

                                                                                                                                                                                                                                                      Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                      mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                      The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                      "}, {"location": "API/models/gnb/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                      Attributesname: strName of the model.

                                                                                                                                                                                                                                                      Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                      This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                      This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                      This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                      • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                      • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                      • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                      • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                      • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                      • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                      • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                        For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                        This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                        The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                        All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                        • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                        • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                        • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                        • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                        • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                        • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                        • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                        • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                          The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                          "}, {"location": "API/models/gnb/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                          The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                          bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                          method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                          Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                          Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                          reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                          method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                          Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                          This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                          Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                          cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                          horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                          vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                          title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                          • If None, no title is shown.
                                                                                                                                                                                                                                                          • If str, text for the title.
                                                                                                                                                                                                                                                          • If dict, title configuration.

                                                                                                                                                                                                                                                          legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                          • If None: No legend is shown.
                                                                                                                                                                                                                                                          • If str: Location where to show the legend.
                                                                                                                                                                                                                                                          • If dict: Legend configuration.

                                                                                                                                                                                                                                                          figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                          filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                          display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                          Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                          method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                          Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                          • In-training validation scores
                                                                                                                                                                                                                                                          • Cached predictions.
                                                                                                                                                                                                                                                          • Shap values
                                                                                                                                                                                                                                                          • App instance
                                                                                                                                                                                                                                                          • Dashboard instance
                                                                                                                                                                                                                                                          • Calculated holdout data sets

                                                                                                                                                                                                                                                          method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                          Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                          method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                          ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                          By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                          Note

                                                                                                                                                                                                                                                          Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                          filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                          **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                          method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                          This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                          Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                          method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                          Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                                          method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                          Tip

                                                                                                                                                                                                                                                          Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                          Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                          rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                          threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                          • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                          • The model has a predict_proba method.
                                                                                                                                                                                                                                                          • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                          For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                          Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                          method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                          The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                          ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                          method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                          The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                          ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                          y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                          method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                          In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                          Warning

                                                                                                                                                                                                                                                          Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                          Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                          method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                          Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                          Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                          method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                          Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                          Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                          reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                          method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                          Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                          ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                          • If None: y is ignored.
                                                                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                          Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                          series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                          method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                          Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                          method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                          Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                                          method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                          Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                                          method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                          This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                          Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                          stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                          archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                          method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                          method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                          Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                          method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                          Info

                                                                                                                                                                                                                                                          If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                          • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                          metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                          Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                                          method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                          The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                          Tip

                                                                                                                                                                                                                                                          Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                          Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                          host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                          port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                          method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                          Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                          ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                          • If None: y is ignored.
                                                                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                          Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                          series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                          method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                          Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                          method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                          Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                          "}, {"location": "API/models/gp/", "title": "GaussianProcess", "text": "

                                                                                                                                                                                                                                                          GP

                                                                                                                                                                                                                                                          Gaussian Processes are a generic supervised learning method designed to solve regression and probabilistic classification problems. The advantages of Gaussian processes are:

                                                                                                                                                                                                                                                          • The prediction interpolates the observations.
                                                                                                                                                                                                                                                          • The prediction is probabilistic (Gaussian) so that one can compute empirical confidence intervals and decide based on those if one should refit (online fitting, adaptive fitting) the prediction in some region of interest.

                                                                                                                                                                                                                                                          The disadvantages of Gaussian processes include:

                                                                                                                                                                                                                                                          • They are not sparse, i.e., they use the whole samples/features information to perform the prediction.
                                                                                                                                                                                                                                                          • They lose efficiency in high dimensional spaces, namely when the number of features exceeds a few dozens.

                                                                                                                                                                                                                                                          Corresponding estimators are:

                                                                                                                                                                                                                                                          • GaussianProcessClassifier for classification tasks.
                                                                                                                                                                                                                                                          • GaussianProcessRegressor for regression tasks.

                                                                                                                                                                                                                                                          Read more in sklearn's documentation.

                                                                                                                                                                                                                                                          See Also

                                                                                                                                                                                                                                                          GaussianNB Gaussian Naive Bayes.

                                                                                                                                                                                                                                                          LinearDiscriminantAnalysis Linear Discriminant Analysis.

                                                                                                                                                                                                                                                          PassiveAggressive Passive Aggressive.

                                                                                                                                                                                                                                                          "}, {"location": "API/models/gp/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                          >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"GP\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: GP\nMetric: f1\n\n\nResults for GaussianProcess:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9437\nTime elapsed: 0.105s\n-------------------------------------------------\nTime: 0.105s\n\n\nFinal results ==================== >>\nTotal time: 0.109s\n-------------------------------------\nGaussianProcess --> f1: 0.9437\n
                                                                                                                                                                                                                                                          "}, {"location": "API/models/gp/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/gp/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                          Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                          Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                          Tip

                                                                                                                                                                                                                                                          Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                          mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                          The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                          "}, {"location": "API/models/gp/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                          Attributesname: strName of the model.

                                                                                                                                                                                                                                                          Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                          This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                          This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                          This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                          • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                          • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                          • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                          • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                          • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                          • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                          • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                            For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                            This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                            The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                            All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                            • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                            • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                            • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                            • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                            • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                            • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                            • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                            • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                              The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                              "}, {"location": "API/models/gp/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                              The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                              bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                              method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                              Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                              Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                              reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                              method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                              Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                              method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                              This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                              Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                              cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                              horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                              vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                              title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                              • If None, no title is shown.
                                                                                                                                                                                                                                                              • If str, text for the title.
                                                                                                                                                                                                                                                              • If dict, title configuration.

                                                                                                                                                                                                                                                              legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                              • If None: No legend is shown.
                                                                                                                                                                                                                                                              • If str: Location where to show the legend.
                                                                                                                                                                                                                                                              • If dict: Legend configuration.

                                                                                                                                                                                                                                                              figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                              filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                              display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                               Yields: go.Figure Plot object.

                                                                                                                                                                                                                                                              method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                              Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                              • In-training validation scores
                                                                                                                                                                                                                                                              • Cached predictions.
                                                                                                                                                                                                                                                              • Shap values
                                                                                                                                                                                                                                                              • App instance
                                                                                                                                                                                                                                                              • Dashboard instance
                                                                                                                                                                                                                                                              • Calculated holdout data sets

                                                                                                                                                                                                                                                              method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                              Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                              method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                              ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                              By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                              Note

                                                                                                                                                                                                                                                              Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                              filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                              **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                              method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                              This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                              Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                              method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                              Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                                              method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                              Tip

                                                                                                                                                                                                                                                              Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                              Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                              rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                              threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                              • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                              • The model has a predict_proba method.
                                                                                                                                                                                                                                                              • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                              For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                               Returns: pd.Series Scores of the model.

                                                                                                                                                                                                                                                              method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                              The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                              ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                              method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                              The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                              ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                              y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                              method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                              In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                              Warning

                                                                                                                                                                                                                                                              Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                              Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                              method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                              Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                              Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                              method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                              Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                              Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                              reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                              method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                               Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                              ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                              • If None: y is ignored.
                                                                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                              Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                              series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                              method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                              Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                              method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                              Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                                              method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                              Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                                              method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                              This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                              Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                              stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                              archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                              method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                              method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                              Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                              method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                              Info

                                                                                                                                                                                                                                                              If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                              • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                              metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                              Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                                              method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                              The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                              Tip

                                                                                                                                                                                                                                                              Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                              Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                              host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                              port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                              method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                              Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                              ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                              • If None: y is ignored.
                                                                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                              Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                              series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                              method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                              Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                              method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                              Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                              "}, {"location": "API/models/hgbm/", "title": "HistGradientBoosting", "text": "

                                                                                                                                                                                                                                                              hGBM

                                                                                                                                                                                                                                                              This Histogram-based Gradient Boosting Machine is much faster than the standard GradientBoostingMachine for big datasets (n_samples>=10k). This variation first bins the input samples into integer-valued bins which tremendously reduces the number of splitting points to consider, and allows the algorithm to leverage integer-based data structures (histograms) instead of relying on sorted continuous values when building the trees.

                                                                                                                                                                                                                                                              Corresponding estimators are:

                                                                                                                                                                                                                                                              • HistGradientBoostingClassifier for classification tasks.
                                                                                                                                                                                                                                                              • HistGradientBoostingRegressor for regression tasks.

                                                                                                                                                                                                                                                              Read more in sklearn's documentation.

                                                                                                                                                                                                                                                              See Also

                                                                                                                                                                                                                                                              CatBoost Cat Boosting Machine.

                                                                                                                                                                                                                                                              GradientBoostingMachine Gradient Boosting Machine.

                                                                                                                                                                                                                                                              XGBoost Extreme Gradient Boosting.

                                                                                                                                                                                                                                                              "}, {"location": "API/models/hgbm/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                              >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"hGBM\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: hGBM\nMetric: f1\n\n\nResults for HistGradientBoosting:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9583\nTime elapsed: 0.357s\n-------------------------------------------------\nTime: 0.357s\n\n\nFinal results ==================== >>\nTotal time: 0.360s\n-------------------------------------\nHistGradientBoosting --> f1: 0.9583\n
                                                                                                                                                                                                                                                              "}, {"location": "API/models/hgbm/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                                                                                                                                                                                                                                                              Parameterslearning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_iterIntDistribution(high=500, log=False, low=10, step=10)max_leaf_nodesIntDistribution(high=50, log=False, low=10, step=1)max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_leafIntDistribution(high=30, log=False, low=10, step=1)l2_regularizationFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)

                                                                                                                                                                                                                                                              ParameterslossCategoricalDistribution(choices=('squared_error', 'absolute_error', 'poisson', 'quantile', 'gamma'))quantileFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_iterIntDistribution(high=500, log=False, low=10, step=10)max_leaf_nodesIntDistribution(high=50, log=False, low=10, step=1)max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_leafIntDistribution(high=30, log=False, low=10, step=1)l2_regularizationFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)

                                                                                                                                                                                                                                                              "}, {"location": "API/models/hgbm/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/hgbm/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                              Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                              Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                              Tip

                                                                                                                                                                                                                                                              Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                              mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                              The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                              "}, {"location": "API/models/hgbm/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                              Attributesname: strName of the model.

                                                                                                                                                                                                                                                              Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                              This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                              This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                              This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                              • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                              • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                              • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                              • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                              • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                              • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                              • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                                For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                                This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                                The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                                All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                                  The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                                  "}, {"location": "API/models/hgbm/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                  The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                                  bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                                  method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                                  Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                                  Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                                  reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                                  method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                                  Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                                                                                                  • Cached predictions.
                                                                                                                                                                                                                                                                  • Shap values
                                                                                                                                                                                                                                                                  • App instance
                                                                                                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                                                                                                  method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                                  Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                                  method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                                  ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                                  By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                                  Note

                                                                                                                                                                                                                                                                  Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                                  method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                                  This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                                  Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                                  method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                  Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                  Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                  Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                                  method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                                  The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                  method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                                  The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                                  ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                                  y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                                  method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                                  In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                  Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                                  Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                                  method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                                  Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                                  Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                                  method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                                  Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                                  Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                                  reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                                  method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                  Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                                  series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                                  method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                  Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                  method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                  Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                                                  method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                  Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                                                  method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                                  This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                                  Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                                  stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                                  archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                  method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                  method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                  Info

                                                                                                                                                                                                                                                                  If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                  • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                  metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                  Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                                                  method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                                  The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                  Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                                  Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                                  host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                                  port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                                  method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                  "}, {"location": "API/models/huber/", "title": "HuberRegression", "text": "

                                                                                                                                                                                                                                                                  Huber needs scaling

                                                                                                                                                                                                                                                                  Huber is a linear regression model that is robust to outliers. It makes sure that the loss function is not heavily influenced by the outliers while not completely ignoring their effect.

                                                                                                                                                                                                                                                                  Corresponding estimators are:

                                                                                                                                                                                                                                                                  • HuberRegressor for regression tasks.

                                                                                                                                                                                                                                                                  Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                  AutomaticRelevanceDetermination Automatic Relevance Determination.

                                                                                                                                                                                                                                                                  LeastAngleRegression Least Angle Regression.

                                                                                                                                                                                                                                                                  OrdinaryLeastSquares Linear Regression.

                                                                                                                                                                                                                                                                  "}, {"location": "API/models/huber/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                  >>> from atom import ATOMRegressor\n>>> from sklearn.datasets import fetch_california_housing\n\n>>> X, y = fetch_california_housing(return_X_y=True)\n\n>>> atom = ATOMRegressor(X, y, random_state=1)\n>>> atom.run(models=\"Huber\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= >>\nModels: Huber\nMetric: r2\n\n\nResults for HuberRegression:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.546\nTest evaluation --> r2: 0.5999\nTime elapsed: 0.187s\n-------------------------------------------------\nTime: 0.187s\n\n\nFinal results ==================== >>\nTotal time: 0.187s\n-------------------------------------\nHuberRegression --> r2: 0.5999\n
                                                                                                                                                                                                                                                                  "}, {"location": "API/models/huber/#hyperparameters", "title": "Hyperparameters", "text": "

                                                                                                                                                                                                                                                                  ParametersepsilonFloatDistribution(high=10.0, log=True, low=1.0, step=None)max_iterIntDistribution(high=500, log=False, low=50, step=10)alphaFloatDistribution(high=1.0, log=True, low=0.0001, step=None)

                                                                                                                                                                                                                                                                  "}, {"location": "API/models/huber/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/huber/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                  Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                                  Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                  Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                                  mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                                  The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                  "}, {"location": "API/models/huber/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                  Attributesname: strName of the model.

                                                                                                                                                                                                                                                                  Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                                  This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                                  This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                                  This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                                  • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                                  • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                                  • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                                  • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                                  • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                  • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                                    For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                                    This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                                    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                                    All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                    • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                    • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                    • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                    • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                    • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                    • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                    • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                    • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                                      The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                                      "}, {"location": "API/models/huber/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                      The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                                      bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                                      method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                                      Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                                      Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                                      reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                                      method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                                      Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                                      method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                      This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                      Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                      cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                      horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                      vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                      title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                      • If None, no title is shown.
                                                                                                                                                                                                                                                                      • If str, text for the title.
                                                                                                                                                                                                                                                                      • If dict, title configuration.

                                                                                                                                                                                                                                                                      legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                      • If None: No legend is shown.
                                                                                                                                                                                                                                                                      • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                      • If dict: Legend configuration.

                                                                                                                                                                                                                                                                      figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                      filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                      display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                      Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                      method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                                      Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                      • In-training validation scores
                                                                                                                                                                                                                                                                      • Cached predictions.
                                                                                                                                                                                                                                                                      • Shap values
                                                                                                                                                                                                                                                                      • App instance
                                                                                                                                                                                                                                                                      • Dashboard instance
                                                                                                                                                                                                                                                                      • Calculated holdout data sets

                                                                                                                                                                                                                                                                      method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                                      Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                                      method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                                      ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                                      By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                                      Note

                                                                                                                                                                                                                                                                      Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                                      filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                                      **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                                      method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                                      This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                                      Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                                      method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                      Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                                                      method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                                      Tip

                                                                                                                                                                                                                                                                      Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                                      Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                                      rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                      threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                      • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                      • The model has a predict_proba method.
                                                                                                                                                                                                                                                                      • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                      For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                      Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                                      method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                                      The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                                      ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                      method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                                      The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                                      Parameters X: dataframe or None, default=None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                                      y: series, dataframe or None, default=None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                                      method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                                      In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                                      Warning

                                                                                                                                                                                                                                                                      Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                                      Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                                      method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                                      Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                                      Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                                      method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                                      Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                                      Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                                      reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                                      method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                                      Transformers that are only applied on the training set are skipped. The rest should all implement a inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                                      ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                      • If None: y is ignored.
                                                                                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                      Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                                      series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                                      method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                      Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                      method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                      Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                                                      method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                      Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                                                      method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                                      This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                                      Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                                      stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                                      archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                                      method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                      method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                                      Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                      method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                      Info

                                                                                                                                                                                                                                                                      If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                      • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                      metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                      Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                                                      method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                                      The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                                      Tip

                                                                                                                                                                                                                                                                      Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                                      Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                                      host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                                      port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                                      method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                                      Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                                      ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                      • If None: y is ignored.
                                                                                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                      Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                      series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                      method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                      Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                      method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                      Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                      "}, {"location": "API/models/knn/", "title": "KNearestNeighbors", "text": "

                                                                                                                                                                                                                                                                      KNN needs scaling accept sparse native multilabel native multioutput supports acceleration

                                                                                                                                                                                                                                                                      K-Nearest Neighbors, as the name clearly indicates, implements the k-nearest neighbors vote. For regression, the target is predicted by local interpolation of the targets associated of the nearest neighbors in the training set.

                                                                                                                                                                                                                                                                      Corresponding estimators are:

                                                                                                                                                                                                                                                                      • KNeighborsClassifier for classification tasks.
                                                                                                                                                                                                                                                                      • KNeighborsRegressor for classification tasks.

                                                                                                                                                                                                                                                                      Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                      See Also

                                                                                                                                                                                                                                                                      LinearDiscriminantAnalysis Linear Discriminant Analysis.

                                                                                                                                                                                                                                                                      QuadraticDiscriminantAnalysis Quadratic Discriminant Analysis.

                                                                                                                                                                                                                                                                      RadiusNearestNeighbors Radius Nearest Neighbors.

                                                                                                                                                                                                                                                                      "}, {"location": "API/models/knn/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                      >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"KNN\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: KNN\nMetric: f1\n\n\nResults for KNearestNeighbors:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.981\nTest evaluation --> f1: 0.9793\nTime elapsed: 0.116s\n-------------------------------------------------\nTime: 0.116s\n\n\nFinal results ==================== >>\nTotal time: 0.119s\n-------------------------------------\nKNearestNeighbors --> f1: 0.9793\n
                                                                                                                                                                                                                                                                      "}, {"location": "API/models/knn/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression sklearnsklearnexcuml

                                                                                                                                                                                                                                                                      Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)

                                                                                                                                                                                                                                                                      cpugpu

                                                                                                                                                                                                                                                                      Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)

                                                                                                                                                                                                                                                                      Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)

                                                                                                                                                                                                                                                                      Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)

                                                                                                                                                                                                                                                                      sklearnsklearnexcuml

                                                                                                                                                                                                                                                                      Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)

                                                                                                                                                                                                                                                                      cpugpu

                                                                                                                                                                                                                                                                      Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)

                                                                                                                                                                                                                                                                      Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)

                                                                                                                                                                                                                                                                      Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)

                                                                                                                                                                                                                                                                      "}, {"location": "API/models/knn/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/knn/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                      Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                                      Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                                      Tip

                                                                                                                                                                                                                                                                      Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                                      mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                                      The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                      "}, {"location": "API/models/knn/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                      Attributesname: strName of the model.

                                                                                                                                                                                                                                                                      Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                                      This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                                      This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                                      This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                                      • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                                      • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                                      • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                                      • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                                      • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                                      • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                      • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                                        For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                                        This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                                        The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                                        All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                        • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                        • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                        • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                        • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                        • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                        • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                        • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                        • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                                          The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                                          "}, {"location": "API/models/knn/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                          The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                                          bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                                          method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                                          Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                                          Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                                          reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                                          method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                                          Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                                          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                          This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                          Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                          cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                          horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                          vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                          title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                          • If None, no title is shown.
                                                                                                                                                                                                                                                                          • If str, text for the title.
                                                                                                                                                                                                                                                                          • If dict, title configuration.

                                                                                                                                                                                                                                                                          legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                          • If None: No legend is shown.
                                                                                                                                                                                                                                                                          • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                          • If dict: Legend configuration.

                                                                                                                                                                                                                                                                          figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                          filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                          display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                          Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                          method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                                          Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                          • In-training validation scores
                                                                                                                                                                                                                                                                          • Cached predictions.
                                                                                                                                                                                                                                                                          • Shap values
                                                                                                                                                                                                                                                                          • App instance
                                                                                                                                                                                                                                                                          • Dashboard instance
                                                                                                                                                                                                                                                                          • Calculated holdout data sets

                                                                                                                                                                                                                                                                          method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                                          Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                                          method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                                          ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                                          By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                                          Note

                                                                                                                                                                                                                                                                          Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                                          filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                                          **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                                          method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                                          This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                                          Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                                          method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                          Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                                                          method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                                          Tip

                                                                                                                                                                                                                                                                          Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                                          Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                                          rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                          threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                          • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                          • The model has a predict_proba method.
                                                                                                                                                                                                                                                                          • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                          For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                          Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                                          method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                                          The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                                          ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                          method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                                          The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                                          ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                                          y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                                          method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                                          In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                                          Warning

                                                                                                                                                                                                                                                                          Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                                          Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                                          method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                                          Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                                          Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                                          method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                                          Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                                          Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                                          reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                                          method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                                          Transformers that are only applied on the training set are skipped. The rest should all implement a inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                                          ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                          • If None: y is ignored.
                                                                                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                          Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                                          series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                                          method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                          Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                          method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                          Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                                                          method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                          Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                                                          method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                                          This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                                          Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                                          stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                                          archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                                          method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                          method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                                          Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                          method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                          Info

                                                                                                                                                                                                                                                                          If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                          • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                          metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                          Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                                                          method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                                          The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                                          Tip

                                                                                                                                                                                                                                                                          Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                                          Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                                          host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                                          port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                                          method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                                          Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                                          ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                          • If None: y is ignored.
                                                                                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                          Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                          series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                          method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                          Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                          method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                          Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                          "}, {"location": "API/models/lars/", "title": "LeastAngleRegression", "text": "

                                                                                                                                                                                                                                                                          Lars needs scaling

                                                                                                                                                                                                                                                                          Least-Angle Regression is a regression algorithm for high-dimensional data. Lars is similar to forward stepwise regression. At each step, it finds the feature most correlated with the target. When there are multiple features having equal correlation, instead of continuing along the same feature, it proceeds in a direction equiangular between the features.

                                                                                                                                                                                                                                                                          Corresponding estimators are:

                                                                                                                                                                                                                                                                          • Lars for regression tasks.

                                                                                                                                                                                                                                                                          Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                          See Also

                                                                                                                                                                                                                                                                          BayesianRidge Bayesian ridge regression.

                                                                                                                                                                                                                                                                          HuberRegression Huber regressor.

                                                                                                                                                                                                                                                                          OrdinaryLeastSquares Linear Regression.

                                                                                                                                                                                                                                                                          "}, {"location": "API/models/lars/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                          >>> from atom import ATOMRegressor\n>>> from sklearn.datasets import fetch_california_housing\n\n>>> X, y = fetch_california_housing(return_X_y=True)\n\n>>> atom = ATOMRegressor(X, y, random_state=1)\n>>> atom.run(models=\"Lars\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= >>\nModels: Lars\nMetric: r2\n\n\nResults for LeastAngleRegression:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.6067\nTest evaluation --> r2: 0.6028\nTime elapsed: 0.136s\n-------------------------------------------------\nTime: 0.136s\n\n\nFinal results ==================== >>\nTotal time: 0.137s\n-------------------------------------\nLeastAngleRegression --> r2: 0.6028\n
                                                                                                                                                                                                                                                                          "}, {"location": "API/models/lars/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/lars/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                          Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                                          Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                                          Tip

                                                                                                                                                                                                                                                                          Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                                          mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                                          The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexNames of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexNames of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                          "}, {"location": "API/models/lars/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                          Attributesname: strName of the model.

                                                                                                                                                                                                                                                                          Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                                          This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                                          This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                                          This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                                          • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                                          • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                                          • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                                          • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                                          • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                                          • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                          • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                                            For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                                            This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                                            The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                                            All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                            • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                            • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                            • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                            • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                            • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                            • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                            • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                            • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                                              The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                                              "}, {"location": "API/models/lars/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                              The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                                              bootstrapping: Apply a bootstrap algorithm. calibrate: Calibrate the model. canvas: Create a figure with multiple plots. clear: Reset attributes and clear cache from the model. create_app: Create an interactive app to test model predictions. create_dashboard: Create an interactive dashboard to analyze the model. cross_validate: Evaluate the model using cross-validation. decision_function: Get confidence scores on new data or existing rows. evaluate: Get the model's scores for the provided metrics. export_pipeline: Export the transformer pipeline with final estimator. fit: Fit and validate the model. full_train: Train the estimator on the complete dataset. get_best_threshold: Get the threshold that maximizes the ROC curve. hyperparameter_tuning: Run the hyperparameter tuning algorithm. inverse_transform: Inversely transform new data through the pipeline. predict: Get predictions on new data or existing rows. predict_log_proba: Get class log-probabilities on new data or existing rows. predict_proba: Get class probabilities on new data or existing rows. register: Register the model in mlflow's model registry. reset_aesthetics: Reset the plot aesthetics to their default values. save_estimator: Save the estimator to a pickle file. score: Get a metric score on new data. serve: Serve the model as rest API endpoint for inference. transform: Transform new data through the pipeline. update_layout: Update the properties of the plot's layout. update_traces: Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                              method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                                              Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                                              Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                                              reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                                              method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                                              Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                                              method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                              This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                              Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                              cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                              horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                              vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                              title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                              • If None, no title is shown.
                                                                                                                                                                                                                                                                              • If str, text for the title.
                                                                                                                                                                                                                                                                              • If dict, title configuration.

                                                                                                                                                                                                                                                                              legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                              • If None: No legend is shown.
                                                                                                                                                                                                                                                                              • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                              • If dict: Legend configuration.

                                                                                                                                                                                                                                                                              figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                              filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                              display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                              Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                              method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                                              Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                              • In-training validation scores
                                                                                                                                                                                                                                                                              • Cached predictions
                                                                                                                                                                                                                                                                              • Shap values
                                                                                                                                                                                                                                                                              • App instance
                                                                                                                                                                                                                                                                              • Dashboard instance
                                                                                                                                                                                                                                                                              • Calculated holdout data sets

                                                                                                                                                                                                                                                                              method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                                              Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                                              method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                                              ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                                              By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                                              Note

                                                                                                                                                                                                                                                                              Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                                              filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                                              **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                                              method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                                              This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                                              Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                                              method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                              Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                                                              method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                                              Tip

                                                                                                                                                                                                                                                                              Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                                              Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                                              rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                              threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                              • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                              • The model has a predict_proba method.
                                                                                                                                                                                                                                                                              • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                              For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                              Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                                              method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                                              The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                                              ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                              method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                                              The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                                              ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                                              y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                                              method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                                              In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                                              Warning

                                                                                                                                                                                                                                                                              Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                                              Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                                              method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                                              Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                                              Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                                              method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                                              Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                                              Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                                              reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                                              method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                                              Transformers that are only applied on the training set are skipped. The rest should all implement a inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                                              ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                              • If None: y is ignored.
                                                                                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                              Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                                              series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                                              method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                              Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                              method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                              Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                                                              method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                              Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                                                              method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                                              This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                                              Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                                              stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                                              archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                                              method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                              method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                                              Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                              method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                              Info

                                                                                                                                                                                                                                                                              If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                              • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                              metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                              Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                                                              method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                                              The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                                              Tip

                                                                                                                                                                                                                                                                              Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                                              Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                                              host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                                              port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                                              method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                                              Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                                              ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                              • If None: y is ignored.
                                                                                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                              Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                              series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                              method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                              Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                              method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                              Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                              "}, {"location": "API/models/lasso/", "title": "Lasso", "text": "

                                                                                                                                                                                                                                                                              Lasso needs scaling accept sparse supports acceleration

                                                                                                                                                                                                                                                                              Linear least squares with l1 regularization.

                                                                                                                                                                                                                                                                              Corresponding estimators are:

                                                                                                                                                                                                                                                                              • Lasso for regression tasks.

                                                                                                                                                                                                                                                                              Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                              See Also

                                                                                                                                                                                                                                                                              ElasticNet Linear Regression with elasticnet regularization.

                                                                                                                                                                                                                                                                              OrdinaryLeastSquares Linear Regression.

                                                                                                                                                                                                                                                                              Ridge Linear least squares with l2 regularization.

                                                                                                                                                                                                                                                                              "}, {"location": "API/models/lasso/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                              >>> from atom import ATOMRegressor\n>>> from sklearn.datasets import fetch_california_housing\n\n>>> X, y = fetch_california_housing(return_X_y=True)\n\n>>> atom = ATOMRegressor(X, y, random_state=1)\n>>> atom.run(models=\"Lasso\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= >>\nModels: Lasso\nMetric: r2\n\n\nResults for Lasso:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.0\nTest evaluation --> r2: -0.0001\nTime elapsed: 0.137s\n-------------------------------------------------\nTime: 0.137s\n\n\nFinal results ==================== >>\nTotal time: 0.139s\n-------------------------------------\nLasso --> r2: -0.0001 ~\n
                                                                                                                                                                                                                                                                              "}, {"location": "API/models/lasso/#hyperparameters", "title": "Hyperparameters", "text": "sklearnsklearnexcuml

                                                                                                                                                                                                                                                                              ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)selectionCategoricalDistribution(choices=('cyclic', 'random'))

                                                                                                                                                                                                                                                                              cpugpu

                                                                                                                                                                                                                                                                              ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)selectionCategoricalDistribution(choices=('cyclic', 'random'))

                                                                                                                                                                                                                                                                              ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)selectionCategoricalDistribution(choices=('cyclic', 'random'))

                                                                                                                                                                                                                                                                              ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)selectionCategoricalDistribution(choices=('cyclic', 'random'))

                                                                                                                                                                                                                                                                              "}, {"location": "API/models/lasso/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/lasso/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                              Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                                              Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                                              Tip

                                                                                                                                                                                                                                                                              Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                                              mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                                              The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                              "}, {"location": "API/models/lasso/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                              Attributesname: strName of the model.

                                                                                                                                                                                                                                                                              Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                                              This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                                              This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                                              This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                                              • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                                              • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                                              • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                                              • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                                              • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                                              • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                              • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                                                For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                                                This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                                                The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                                                All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                                                  The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/lasso/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                  The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                                                  bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                                                  method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                                                  Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                                                  Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                                                  reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                                                  method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                                                  Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                  Parameters: rows: int, default=1 Number of rows of plots in the figure (plots stacked vertically).

                                                                                                                                                                                                                                                                                  cols: int, default=2 Number of columns of plots in the figure (plots placed side by side).

                                                                                                                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                                                                                                                  • Cached predictions
                                                                                                                                                                                                                                                                                  • Shap values
                                                                                                                                                                                                                                                                                  • App instance
                                                                                                                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                                                                                                                  method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                                                  Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                                                  method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                                                  ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                                                  By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                                                  Note

                                                                                                                                                                                                                                                                                  Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                                                  method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                                                  This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                                                  Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                                                  method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                  Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                  Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                  Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                                                  method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                                                  The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                  method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                                                  The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                                                  ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                                                  y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                                                  method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                                                  In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                  Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                                                  Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                                                  method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                                                  Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                                                  Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                                                  method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                                                  Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                                                  Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                                                  reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                                                  method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. The rest should all implement a inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                  Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                  series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                                                  method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                  Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                  method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                  Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                                                                  method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                  Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                                                                  method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                                                  This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                                                  Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                                                  stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                                                  archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                  method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                  method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                  Info

                                                                                                                                                                                                                                                                                  If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                  • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                  metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                  Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                                                                  method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                                                  The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                  Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                                                  Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                                                  host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                                                  port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                                                  method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/lda/", "title": "LinearDiscriminantAnalysis", "text": "

                                                                                                                                                                                                                                                                                  LDA

                                                                                                                                                                                                                                                                                  Linear Discriminant Analysis is a classifier with a linear decision boundary, generated by fitting class conditional densities to the data and using Bayes\u2019 rule. The model fits a Gaussian density to each class, assuming that all classes share the same covariance matrix.

                                                                                                                                                                                                                                                                                  Corresponding estimators are:

                                                                                                                                                                                                                                                                                  • LinearDiscriminantAnalysis for classification tasks.

                                                                                                                                                                                                                                                                                  Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                  LogisticRegression Logistic Regression.

                                                                                                                                                                                                                                                                                  RadiusNearestNeighbors Radius Nearest Neighbors.

                                                                                                                                                                                                                                                                                  QuadraticDiscriminantAnalysis Quadratic Discriminant Analysis.

                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/lda/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"LDA\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: LDA\nMetric: f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9743\nTest evaluation --> f1: 0.9726\nTime elapsed: 0.025s\n-------------------------------------------------\nTime: 0.025s\n\n\nFinal results ==================== >>\nTotal time: 0.028s\n-------------------------------------\nLinearDiscriminantAnalysis --> f1: 0.9726\n
                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/lda/#hyperparameters", "title": "Hyperparameters", "text": "

                                                                                                                                                                                                                                                                                  ParameterssolverCategoricalDistribution(choices=('svd', 'lsqr', 'eigen'))shrinkageCategoricalDistribution(choices=(None, 'auto', 0.5, 0.6, 0.7, 0.8, 0.9, 1.0))

                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/lda/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/lda/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                  Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                                                  Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                  Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                                                  mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                                                  The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/lda/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                  Attributesname: strName of the model.

                                                                                                                                                                                                                                                                                  Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                                                  This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                                                  This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                                                  This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                                                  • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                                                  • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                                                  • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                                                  • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                                                  • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                  • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                                                    For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                                                    This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                                                    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                                                    All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                    • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                    • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                    • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                    • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                    • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                    • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                    • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                    • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                                                      The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/lda/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                      The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                                                      bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                                                      method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                                                      Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                                                      Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                                                      reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                                                      method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                                                      Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                                                      method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                      This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                      Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                      cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                      horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                      vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                      title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                      • If None, no title is shown.
                                                                                                                                                                                                                                                                                      • If str, text for the title.
                                                                                                                                                                                                                                                                                      • If dict, title configuration.

                                                                                                                                                                                                                                                                                      legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                      • If None: No legend is shown.
                                                                                                                                                                                                                                                                                      • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                      • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                      figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                      filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                      display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                      Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                      method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                                                      Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                      • In-training validation scores
                                                                                                                                                                                                                                                                                      • Cached predictions.
                                                                                                                                                                                                                                                                                      • Shap values
                                                                                                                                                                                                                                                                                      • App instance
                                                                                                                                                                                                                                                                                      • Dashboard instance
                                                                                                                                                                                                                                                                                      • Calculated holdout data sets

                                                                                                                                                                                                                                                                                      method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                                                      Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                                                      method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                                                      ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                                                      By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                                                      Note

                                                                                                                                                                                                                                                                                      Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                                                      filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                                                      **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                                                      method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                                                      This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                                                      Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                                                      method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                      Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                                                                      method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                                                      Tip

                                                                                                                                                                                                                                                                                      Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                                                      Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                                                      rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                      threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                      • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                      • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                      • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                      For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                      Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                                                      method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                                                      The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                                                      ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                      method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                                                      The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                                                      ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                                                      y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                                                      method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                                                      In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                                                      Warning

                                                                                                                                                                                                                                                                                      Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                                                      Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                                                      method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                                                      Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                                                      Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                                                      method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                                                      Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                                                      Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                                                      reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                                                      method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                                                      Transformers that are only applied on the training set are skipped. The rest should all implement a inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                                                      ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                      • If None: y is ignored.
                                                                                                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                      Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                      series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                                                      method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                      Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                      method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                      Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                                                                      method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                      Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                                                                      method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                                                      This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                                                      Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                                                      stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                                                      archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                                                      method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                      method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                                                      Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                      method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                                      Info

                                                                                                                                                                                                                                                                                      If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                      • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                      metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                      Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                                                                      method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                                                      The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                                                      Tip

                                                                                                                                                                                                                                                                                      Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                                                      Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                                                      host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                                                      port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                                                      method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                                                      Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                                                      ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                      • If None: y is ignored.
                                                                                                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                      Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                      series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                      method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                      Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                      method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                      Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/lgb/", "title": "LightGBM", "text": "

                                                                                                                                                                                                                                                                                      LGB needs scaling accept sparse allows validation supports acceleration

                                                                                                                                                                                                                                                                                      LightGBM is a gradient boosting model that uses tree based learning algorithms. It is designed to be distributed and efficient with the following advantages:

                                                                                                                                                                                                                                                                                      • Faster training speed and higher efficiency.
                                                                                                                                                                                                                                                                                      • Lower memory usage.
                                                                                                                                                                                                                                                                                      • Better accuracy.
                                                                                                                                                                                                                                                                                      • Capable of handling large-scale data.

                                                                                                                                                                                                                                                                                      Corresponding estimators are:

                                                                                                                                                                                                                                                                                      • LGBMClassifier for classification tasks.
                                                                                                                                                                                                                                                                                      • LGBMRegressor for regression tasks.

                                                                                                                                                                                                                                                                                      Read more in LightGBM's documentation.

                                                                                                                                                                                                                                                                                      Info

                                                                                                                                                                                                                                                                                      Using LightGBM's GPU acceleration requires additional software dependencies.

                                                                                                                                                                                                                                                                                      See Also

                                                                                                                                                                                                                                                                                      CatBoost Cat Boosting Machine.

                                                                                                                                                                                                                                                                                      GradientBoostingMachine Gradient Boosting Machine.

                                                                                                                                                                                                                                                                                      XGBoost Extreme Gradient Boosting.

                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/lgb/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                      >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"LGB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: LGB\nMetric: f1\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9583\nTime elapsed: 0.426s\n-------------------------------------------------\nTime: 0.426s\n\n\nFinal results ==================== >>\nTotal time: 0.429s\n-------------------------------------\nLightGBM --> f1: 0.9583\n
                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/lgb/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                                                                                                                                                                                                                                                                                      Parametersn_estimatorsIntDistribution(high=500, log=False, low=20, step=10)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_depthIntDistribution(high=17, log=False, low=-1, step=2)num_leavesIntDistribution(high=40, log=False, low=20, step=1)min_child_weightFloatDistribution(high=100.0, log=True, low=0.0001, step=None)min_child_samplesIntDistribution(high=30, log=False, low=1, step=1)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)colsample_bytreeFloatDistribution(high=1.0, log=False, low=0.4, step=0.1)reg_alphaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)reg_lambdaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)

                                                                                                                                                                                                                                                                                      Parametersn_estimatorsIntDistribution(high=500, log=False, low=20, step=10)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_depthIntDistribution(high=17, log=False, low=-1, step=2)num_leavesIntDistribution(high=40, log=False, low=20, step=1)min_child_weightFloatDistribution(high=100.0, log=True, low=0.0001, step=None)min_child_samplesIntDistribution(high=30, log=False, low=1, step=1)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)colsample_bytreeFloatDistribution(high=1.0, log=False, low=0.4, step=0.1)reg_alphaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)reg_lambdaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)

                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/lgb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/lgb/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                      Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                                                      Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                                                      Tip

                                                                                                                                                                                                                                                                                      Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                                                      mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                                                      The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/lgb/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                      Attributesname: strName of the model.

                                                                                                                                                                                                                                                                                      Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                                                      This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                                                      This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                                                      This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                                                      • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                                                      • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                                                      • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                                                      • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                                                      • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                                                      • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                      • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                                                        For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                                                        This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training.

                                                                                                                                                                                                                                                                                        Only the scores of the main metric are tracked. Included keys are: train and test. This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                                                        The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                                                        All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                        • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                        • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                        • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                        • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                        • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                        • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                        • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                        • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                                                          The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/lgb/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                          The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                                                          bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                                                          method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                                                          Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                                                          Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                                                          reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                                                          method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                                                          Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                                                          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                          This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                          Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                          cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                          horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                          vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                          title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                          • If None, no title is shown.
                                                                                                                                                                                                                                                                                          • If str, text for the title.
                                                                                                                                                                                                                                                                                          • If dict, title configuration.

                                                                                                                                                                                                                                                                                          legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                          • If None: No legend is shown.
                                                                                                                                                                                                                                                                                          • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                          • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                          figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                          filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                          display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                          Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                          method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                                                          Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                          • In-training validation scores
                                                                                                                                                                                                                                                                                          • Cached predictions.
                                                                                                                                                                                                                                                                                          • Shap values
                                                                                                                                                                                                                                                                                          • App instance
                                                                                                                                                                                                                                                                                          • Dashboard instance
                                                                                                                                                                                                                                                                                          • Calculated holdout data sets

                                                                                                                                                                                                                                                                                          method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                                                          Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                                                          method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                                                          ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                                                          By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                                                          Note

                                                                                                                                                                                                                                                                                          Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                                                          filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                                                          **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                                                          method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                                                          This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                                                          Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                                                          method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                          Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                                                                          method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                                                          Tip

                                                                                                                                                                                                                                                                                          Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                                                          Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                                                          rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                          threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                          • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                          • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                          • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                          For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                          Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                                                          method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                                                          The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                                                          ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                          method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                                                          The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                                                          ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                                                          y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                                                          method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                                                          In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                                                          Warning

                                                                                                                                                                                                                                                                                          Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                                                          Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                                                          method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                                                          Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                                                          Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                                                          method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                                                          Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                                                          Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                                                          reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                                                          method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                                                          Transformers that are only applied on the training set are skipped. The rest should all implement a inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                                                          ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                          • If None: y is ignored.
                                                                                                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                          Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                          series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                                                          method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                          Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                          method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                          Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                                                                          method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                          Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                                                                          method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                                                          This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                                                          Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                                                          stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                                                          archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                                                          method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                          method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                                                          Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                          method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                                          Info

                                                                                                                                                                                                                                                                                          If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                          • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                          metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                          Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                                                                          method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                                                          The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                                                          Tip

                                                                                                                                                                                                                                                                                          Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                                                          Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                                                          host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                                                          port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                                                          method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                                                          Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                                                          ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                          • If None: y is ignored.
                                                                                                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                          Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                          series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                          method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                          Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                          method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                          Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/lr/", "title": "LogisticRegression", "text": "

                                                                                                                                                                                                                                                                                          LR needs scaling accept sparse supports acceleration

                                                                                                                                                                                                                                                                                          Logistic regression, despite its name, is a linear model for classification rather than regression. Logistic regression is also known in the literature as logit regression, maximum-entropy classification (MaxEnt) or the log-linear classifier. In this model, the probabilities describing the possible outcomes of a single trial are modeled using a logistic function.

                                                                                                                                                                                                                                                                                          Corresponding estimators are:

                                                                                                                                                                                                                                                                                          • LogisticRegression for classification tasks.

                                                                                                                                                                                                                                                                                          Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                          See Also

                                                                                                                                                                                                                                                                                          GaussianProcess Gaussian process.

                                                                                                                                                                                                                                                                                          LinearDiscriminantAnalysis Linear Discriminant Analysis.

                                                                                                                                                                                                                                                                                          PassiveAggressive Passive Aggressive.

                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/lr/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                          >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"RF\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: RF\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9524\nTime elapsed: 0.229s\n-------------------------------------------------\nTime: 0.229s\n\n\nFinal results ==================== >>\nTotal time: 0.232s\n-------------------------------------\nRandomForest --> f1: 0.9524\n
                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/lr/#hyperparameters", "title": "Hyperparameters", "text": "sklearnsklearnexcuml

                                                                                                                                                                                                                                                                                          ParameterspenaltyCategoricalDistribution(choices=(None, 'l1', 'l2', 'elasticnet'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'))max_iterIntDistribution(high=1000, log=False, low=100, step=10)l1_ratioFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)

                                                                                                                                                                                                                                                                                          cpugpu

                                                                                                                                                                                                                                                                                          ParameterspenaltyCategoricalDistribution(choices=(None, 'l1', 'l2', 'elasticnet'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'))max_iterIntDistribution(high=1000, log=False, low=100, step=10)l1_ratioFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)

                                                                                                                                                                                                                                                                                          ParameterspenaltyCategoricalDistribution(choices=(None, 'l1', 'l2', 'elasticnet'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'))max_iterIntDistribution(high=1000, log=False, low=100, step=10)l1_ratioFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)

                                                                                                                                                                                                                                                                                          ParameterspenaltyCategoricalDistribution(choices=(None, 'l1', 'l2', 'elasticnet'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'))max_iterIntDistribution(high=1000, log=False, low=100, step=10)l1_ratioFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)

                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/lr/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/lr/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                          Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                                                          Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                                                          Tip

                                                                                                                                                                                                                                                                                          Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                                                          mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                                                          The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/lr/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                          Attributesname: strName of the model.

                                                                                                                                                                                                                                                                                          Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                                                          This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                                                          This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                                                          This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                                                          • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                                                          • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                                                          • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                                                          • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                                                          • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                                                          • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                          • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                                                            For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                                                            This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                                                            The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                                                            All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                            • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                            • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                            • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                            • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                            • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                            • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                            • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                            • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                                                              The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/lr/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                              The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                                                              bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                                                              method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                                                              Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                                                              Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                                                              reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                                                              method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                                                              Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                                                              method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                              This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                              Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                              cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                              horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                              vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                              title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                              • If None, no title is shown.
                                                                                                                                                                                                                                                                                              • If str, text for the title.
                                                                                                                                                                                                                                                                                              • If dict, title configuration.

                                                                                                                                                                                                                                                                                              legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                              • If None: No legend is shown.
                                                                                                                                                                                                                                                                                              • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                              • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                              figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                              filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                              display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                              Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                              method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                                                              Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                              • In-training validation scores
                                                                                                                                                                                                                                                                                              • Cached predictions.
                                                                                                                                                                                                                                                                                              • Shap values
                                                                                                                                                                                                                                                                                              • App instance
                                                                                                                                                                                                                                                                                              • Dashboard instance
                                                                                                                                                                                                                                                                                              • Calculated holdout data sets

                                                                                                                                                                                                                                                                                              method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                                                              Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                                                              method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                                                              ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                                                              By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                                                              Note

                                                                                                                                                                                                                                                                                              Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                                                              filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                                                              **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                                                              method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                                                              This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                                                              Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                                                              method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                              Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                                                                              method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                                                              Tip

                                                                                                                                                                                                                                                                                              Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                                                              Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                                                              rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                              threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                              • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                              • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                              • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                              For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                              Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                                                              method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                                                              The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                                                              ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                              method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                                                              The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                                                              ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                                                              y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                                                              method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                                                              In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                                                              Warning

                                                                                                                                                                                                                                                                                              Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                                                              Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                                                              method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                                                              Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                                                              Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                                                              method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                                                              Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                                                              Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                                                              reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                                                              method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                                                              Transformers that are only applied on the training set are skipped. The rest should all implement a inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                                                              ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                              • If None: y is ignored.
                                                                                                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                              Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                              series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                                                              method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                              Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                              method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                              Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                                                                              method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                              Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                                                                              method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                                                              This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                                                              Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                                                              stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                                                              archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                                                              method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                              method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                                                              Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                              method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                                              Info

                                                                                                                                                                                                                                                                                              If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                              • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                              metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                              Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                                                                              method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                                                              The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                                                              Tip

                                                                                                                                                                                                                                                                                              Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                                                              Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                                                              host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                                                              port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                                                              method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                                                              Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                                                              ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                              • If None: y is ignored.
                                                                                                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                              Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                              series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                              method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                              Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                              method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                              Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/lsvm/", "title": "LinearSVM", "text": "

                                                                                                                                                                                                                                                                                              lSVM needs scaling accept sparse supports acceleration

                                                                                                                                                                                                                                                                                              Similar to SupportVectorMachine but with a linear kernel. Implemented in terms of liblinear rather than libsvm, so it has more flexibility in the choice of penalties and loss functions and should scale better to large numbers of samples.

                                                                                                                                                                                                                                                                                              Corresponding estimators are:

                                                                                                                                                                                                                                                                                              • LinearSVC for classification tasks.
                                                                                                                                                                                                                                                                                              • LinearSVR for classification tasks.

                                                                                                                                                                                                                                                                                              Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                              See Also

                                                                                                                                                                                                                                                                                              KNearestNeighbors K-Nearest Neighbors.

                                                                                                                                                                                                                                                                                              StochasticGradientDescent Stochastic Gradient Descent.

                                                                                                                                                                                                                                                                                              SupportVectorMachine Support Vector Machine.

                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/lsvm/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                              >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"lSVM\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: lSVM\nMetric: f1\n\n\nResults for LinearSVM:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.993\nTest evaluation --> f1: 0.9722\nTime elapsed: 0.089s\n-------------------------------------------------\nTime: 0.089s\n\n\nFinal results ==================== >>\nTotal time: 0.092s\n-------------------------------------\nLinearSVM --> f1: 0.9722\n
                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/lsvm/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression sklearncuml

                                                                                                                                                                                                                                                                                              ParameterspenaltyCategoricalDistribution(choices=('l1', 'l2'))lossCategoricalDistribution(choices=('hinge', 'squared_hinge'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)dualCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                                                                                                                              ParameterspenaltyCategoricalDistribution(choices=('l1', 'l2'))lossCategoricalDistribution(choices=('hinge', 'squared_hinge'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)dualCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                                                                                                                              sklearncuml

                                                                                                                                                                                                                                                                                              ParameterslossCategoricalDistribution(choices=('epsilon_insensitive', 'squared_epsilon_insensitive'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)dualCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                                                                                                                              ParameterslossCategoricalDistribution(choices=('epsilon_insensitive', 'squared_epsilon_insensitive'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)dualCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/lsvm/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/lsvm/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                              Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                                                              Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                                                              Tip

                                                                                                                                                                                                                                                                                              Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                                                              mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                                                              The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/lsvm/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                              Attributesname: strName of the model.

                                                                                                                                                                                                                                                                                              Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                                                              This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                                                              This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                                                              This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                                                              • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                                                              • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                                                              • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                                                              • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                                                              • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                                                              • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                              • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                                                                For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                                                                This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                                                                The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                                                                All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                                • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                                • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                                • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                                • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                                • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                                • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                                                                  The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/lsvm/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                  The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                                                                  bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                  method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                                                                  Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                                                                  Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                                                                  reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                                                                  method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                                                                  Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                                                                                                                                  • Cached predictions.
                                                                                                                                                                                                                                                                                                  • Shap values
                                                                                                                                                                                                                                                                                                  • App instance
                                                                                                                                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                                                                                                                                  method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                                                                  Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                                                                  method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                                                                  ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                                                                  By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                                                                  Note

                                                                                                                                                                                                                                                                                                  Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                                                                  method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                                                                  This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                                                                  Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                                                                  method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                  Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                  Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                                  Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                                                                  method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                                                                  The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                                  method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                                                                  The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                                                                  ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                                                                  y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                                                                  method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                                                                  In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                  Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                                                                  Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                                                                  method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                                                                  Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                                                                  Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                                                                  method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                                                                  Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                                                                  Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                                                                  reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                                                                  method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. The rest should all implement a inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                  Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                  series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                  method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                  Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                                  method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                  Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                                                                                  method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                  Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                                                                                  method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                                                                  This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                                                                  Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                                                                  stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                                                                  archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                                  method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                                  method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                  Info

                                                                                                                                                                                                                                                                                                  If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                  • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                  metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                  Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                                                                                  method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                                                                  The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                  Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                                                                  Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                                                                  host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                                                                  port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                                                                  method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/mlp/", "title": "MultiLayerPerceptron", "text": "

                                                                                                                                                                                                                                                                                                  MLP needs scaling accept sparse native multilabel allows validation

                                                                                                                                                                                                                                                                                                  Multi-layer Perceptron is a supervised learning algorithm that learns a function by training on a dataset. Given a set of features and a target, it can learn a non-linear function approximator for either classification or regression. It is different from logistic regression, in that between the input and the output layer, there can be one or more non-linear layers, called hidden layers.

                                                                                                                                                                                                                                                                                                  Corresponding estimators are:

                                                                                                                                                                                                                                                                                                  • MLPClassifier for classification tasks.
                                                                                                                                                                                                                                                                                                  • MLPRegressor for regression tasks.

                                                                                                                                                                                                                                                                                                  Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                  PassiveAggressive Passive Aggressive.

                                                                                                                                                                                                                                                                                                  Perceptron Linear Perceptron classification.

                                                                                                                                                                                                                                                                                                  StochasticGradientDescent Stochastic Gradient Descent.

                                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/mlp/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"MLP\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: MLP\nMetric: f1\n\n\nResults for MultiLayerPerceptron:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9965\nTest evaluation --> f1: 0.979\nTime elapsed: 1.783s\n-------------------------------------------------\nTime: 1.783s\n\n\nFinal results ==================== >>\nTotal time: 1.786s\n-------------------------------------\nMultiLayerPerceptron --> f1: 0.979\n
                                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/mlp/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                                                                                                                                                                                                                                                                                                  Parametershidden_layer_1IntDistribution(high=100, log=False, low=10, step=1)hidden_layer_2IntDistribution(high=100, log=False, low=0, step=1)hidden_layer_3IntDistribution(high=10, log=False, low=0, step=1)activationCategoricalDistribution(choices=('identity', 'logistic', 'tanh', 'relu'))solverCategoricalDistribution(choices=('lbfgs', 'sgd', 'adam'))alphaFloatDistribution(high=0.1, log=True, low=0.0001, step=None)batch_sizeCategoricalDistribution(choices=('auto', 8, 16, 32, 64, 128, 256))learning_rateCategoricalDistribution(choices=('constant', 'invscaling', 'adaptive'))learning_rate_initFloatDistribution(high=0.1, log=True, low=0.001, step=None)power_tFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)max_iterIntDistribution(high=500, log=False, low=50, step=10)

                                                                                                                                                                                                                                                                                                  Parametershidden_layer_1IntDistribution(high=100, log=False, low=10, step=1)hidden_layer_2IntDistribution(high=100, log=False, low=0, step=1)hidden_layer_3IntDistribution(high=10, log=False, low=0, step=1)activationCategoricalDistribution(choices=('identity', 'logistic', 'tanh', 'relu'))solverCategoricalDistribution(choices=('lbfgs', 'sgd', 'adam'))alphaFloatDistribution(high=0.1, log=True, low=0.0001, step=None)batch_sizeCategoricalDistribution(choices=('auto', 8, 16, 32, 64, 128, 256))learning_rateCategoricalDistribution(choices=('constant', 'invscaling', 'adaptive'))learning_rate_initFloatDistribution(high=0.1, log=True, low=0.001, step=None)power_tFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)max_iterIntDistribution(high=500, log=False, low=50, step=10)

                                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/mlp/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/mlp/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                                  Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                                                                  Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                  Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                                                                  mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                                                                  The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/mlp/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                                  Attributesname: strName of the model.

                                                                                                                                                                                                                                                                                                  Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                                                                  This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                                                                  This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                                                                  This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                                                                  • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                                                                  • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                                                                  • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                                                                  • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                                                                  • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                  • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                                                                    For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                                                                    This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training.

                                                                                                                                                                                                                                                                                                    Only the scores of the main metric are tracked. Included keys are: train and test. This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                                                                    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                                                                    All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                                    • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                    • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                    • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                                    • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                                    • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                                    • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                                    • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                                    • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                                                                      The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/mlp/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                      The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                                                                      bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                      method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                                                                      Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                                                                      Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                                                                      reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                                                                      method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                                                                      Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                                                                      method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                                      This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                                      Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                                      cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                                      horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                      vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                      title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                      • If None, no title is shown.
                                                                                                                                                                                                                                                                                                      • If str, text for the title.
                                                                                                                                                                                                                                                                                                      • If dict, title configuration.

                                                                                                                                                                                                                                                                                                      legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                      • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                      • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                      • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                      figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                                      filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                      display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                                      Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                                      method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                                                                      Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                                      • In-training validation scores
                                                                                                                                                                                                                                                                                                      • Cached predictions.
                                                                                                                                                                                                                                                                                                      • Shap values
                                                                                                                                                                                                                                                                                                      • App instance
                                                                                                                                                                                                                                                                                                      • Dashboard instance
                                                                                                                                                                                                                                                                                                      • Calculated holdout data sets

                                                                                                                                                                                                                                                                                                      method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                                                                      Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                                                                      method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                                                                      ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                                                                      By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                                                                      Note

                                                                                                                                                                                                                                                                                                      Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                                                                      filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                                                                      **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                                                                      method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                                                                      This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                                                                      Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                                                                      method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                      Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                                                                                      method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                                                                      Tip

                                                                                                                                                                                                                                                                                                      Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                                                                      Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                                                                      rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                                      threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                                      • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                                      • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                                      • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                                      For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                                      Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                                                                      method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                                                                      The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                                                                      ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                                      method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                                                                      The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                                                                      ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                                                                      y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                                                                      method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                                                                      In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                                                                      Warning

                                                                                                                                                                                                                                                                                                      Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                                                                      Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                                                                      method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                                                                      Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                                                                      Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                                                                      method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                                                                      Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                                                                      Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                                                                      reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                                                                      method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                      Transformers that are only applied on the training set are skipped. The rest should all implement a inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                                                                      ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                      • If None: y is ignored.
                                                                                                                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                      Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                      series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                      method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                      Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                                      method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                      Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                                                                                      method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                      Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                                                                                      method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                                                                      This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                                                                      Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                                                                      stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                                                                      archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                                                                      method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                                      method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                                                                      Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                                      method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                                                      Info

                                                                                                                                                                                                                                                                                                      If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                      • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                      metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                      Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                                                                                      method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                                                                      The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                                                                      Tip

                                                                                                                                                                                                                                                                                                      Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                                                                      Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                                                                      host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                                                                      port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                                                                      method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                      Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                                                                      ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                      • If None: y is ignored.
                                                                                                                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                      Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                      series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                      method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                                      Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                                      method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                      Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/mnb/", "title": "MultinomialNB", "text": "

                                                                                                                                                                                                                                                                                                      MNB accept sparse supports acceleration

                                                                                                                                                                                                                                                                                                      MultinomialNB implements the Naive Bayes algorithm for multinomially distributed data, and is one of the two classic Naive Bayes variants used in text classification (where the data are typically represented as word vector counts, although tf-idf vectors are also known to work well in practice).

                                                                                                                                                                                                                                                                                                      Corresponding estimators are:

                                                                                                                                                                                                                                                                                                      • MultinomialNB for classification tasks.

                                                                                                                                                                                                                                                                                                      Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                      See Also

                                                                                                                                                                                                                                                                                                      BernoulliNB Bernoulli Naive Bayes.

                                                                                                                                                                                                                                                                                                      ComplementNB Complement Naive Bayes.

                                                                                                                                                                                                                                                                                                      GaussianNB Gaussian Naive Bayes.

                                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/mnb/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                      >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"MNB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: MNB\nMetric: f1\n\n\nResults for MultinomialNB:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9238\nTest evaluation --> f1: 0.9128\nTime elapsed: 0.021s\n-------------------------------------------------\nTime: 0.021s\n\n\nFinal results ==================== >>\nTotal time: 0.024s\n-------------------------------------\nMultinomialNB --> f1: 0.9128\n
                                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/mnb/#hyperparameters", "title": "Hyperparameters", "text": "sklearncuml

                                                                                                                                                                                                                                                                                                      ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                                                                                                                                      ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/mnb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/mnb/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                                      Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                                                                      Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                                                                      Tip

                                                                                                                                                                                                                                                                                                      Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                                                                      mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                                                                      The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/mnb/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                                      Attributesname: strName of the model.

                                                                                                                                                                                                                                                                                                      Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                                                                      This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                                                                      This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                                                                      This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                                                                      • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                                                                      • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                                                                      • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                                                                      • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                                                                      • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                                                                      • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                      • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                                                                        For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                                                                        This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                                                                        The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                                                                        All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                                        • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                        • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                        • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                                        • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                                        • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                                        • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                                        • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                                        • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                                                                          The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/mnb/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                          The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                                                                          bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                          method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                                                                          Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                                                                          Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                                                                          reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                                                                          method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                                                                          Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                                                                          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                                          This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                                          Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                                          cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                                          horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                          vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                          title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                          • If None, no title is shown.
                                                                                                                                                                                                                                                                                                          • If str, text for the title.
                                                                                                                                                                                                                                                                                                          • If dict, title configuration.

                                                                                                                                                                                                                                                                                                          legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                          • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                          • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                          • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                          figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                                          filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                          display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                                          Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                                          method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                                                                          Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                                          • In-training validation scores
                                                                                                                                                                                                                                                                                                          • Cached predictions.
                                                                                                                                                                                                                                                                                                          • Shap values
                                                                                                                                                                                                                                                                                                          • App instance
                                                                                                                                                                                                                                                                                                          • Dashboard instance
                                                                                                                                                                                                                                                                                                          • Calculated holdout data sets

                                                                                                                                                                                                                                                                                                          method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                                                                          Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                                                                          method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                                                                          ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                                                                          By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                                                                          Note

                                                                                                                                                                                                                                                                                                          Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                                                                          filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                                                                          **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                                                                          method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                                                                          This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                                                                          Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                                                                          method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                          Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                                                                                          method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                                                                          Tip

                                                                                                                                                                                                                                                                                                          Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                                                                          Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                                                                          rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                                          threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                                          • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                                          • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                                          • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                                          For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                                          Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                                                                          method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                                                                          The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                                                                          ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                                          method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                                                                          The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                                                                          ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                                                                          y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                                                                          method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                                                                          In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                                                                          Warning

                                                                                                                                                                                                                                                                                                          Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                                                                          Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                                                                          method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                                                                          Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                                                                          Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                                                                          method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                                                                          Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                                                                          Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                                                                          reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                                                                          method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                          Transformers that are only applied on the training set are skipped. The rest should all implement a inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                                                                          ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                          • If None: y is ignored.
                                                                                                                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                          Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                          series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                          method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                          Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                                          method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                          Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                                                                                          method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                          Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                                                                                          method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                                                                          This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                                                                          Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                                                                          stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                                                                          archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                                                                          method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                                          method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                                                                          Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                                          method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                                                          Info

                                                                                                                                                                                                                                                                                                          If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                          • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                          metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                          Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                                                                                          method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                                                                          The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                                                                          Tip

                                                                                                                                                                                                                                                                                                          Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                                                                          Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                                                                          host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                                                                          port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                                                                          method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                          Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                                                                          ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                          • If None: y is ignored.
                                                                                                                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                          Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                          series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                          method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                                          Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                                          method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                          Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/nf/", "title": "NaiveForecaster", "text": "

                                                                                                                                                                                                                                                                                                          NF native multioutput

                                                                                                                                                                                                                                                                                                          NaiveForecaster is a dummy forecaster that makes forecasts using simple strategies based on naive assumptions about past trends continuing. When used in multivariate tasks, each column is forecasted with the same strategy.

                                                                                                                                                                                                                                                                                                          Corresponding estimators are:

                                                                                                                                                                                                                                                                                                          • NaiveForecaster for forecasting tasks.

                                                                                                                                                                                                                                                                                                          See Also

                                                                                                                                                                                                                                                                                                          ExponentialSmoothing Exponential Smoothing forecaster.

                                                                                                                                                                                                                                                                                                          Dummy Dummy classifier/regressor.

                                                                                                                                                                                                                                                                                                          PolynomialTrend Polynomial Trend forecaster.

                                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/nf/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                          >>> from atom import ATOMForecaster\n>>> from sktime.datasets import load_airline\n\n>>> y = load_airline()\n\n>>> atom = ATOMForecaster(y, random_state=1)\n>>> atom.run(models=\"NF\", verbose=2)\n\n\nTraining ========================= >>\nModels: NF\nMetric: mape\n\n\nResults for NaiveForecaster:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0858\nTest evaluation --> mape: -0.2305\nTime elapsed: 0.022s\n-------------------------------------------------\nTime: 0.022s\n\n\nFinal results ==================== >>\nTotal time: 0.023s\n-------------------------------------\nNaiveForecaster --> mape: -0.2305\n
                                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/nf/#hyperparameters", "title": "Hyperparameters", "text": "

                                                                                                                                                                                                                                                                                                          ParametersstrategyCategoricalDistribution(choices=('last', 'mean', 'drift'))

                                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/nf/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/nf/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                                          Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                                                                          Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                                                                          Tip

                                                                                                                                                                                                                                                                                                          Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                                                                          mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                                                                          The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/nf/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                                          Attributesname: strName of the model.

                                                                                                                                                                                                                                                                                                          Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                                                                          This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                                                                          This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                                                                          This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                                                                          • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                                                                          • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                                                                          • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                                                                          • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                                                                          • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                                                                          • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                          • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                                                                            For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                                                                            This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                                                                            The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                                                                            All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                                            • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                            • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                            • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                                            • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                                            • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                                            • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                                            • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                                            • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                                                                              The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/nf/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                              The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                                                                              bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                              method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                                                                              Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                                                                              Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                                                                              reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                                                                              method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                                                                              Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                                                                              method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                                              This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                                              Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                                              cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                                              horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                              vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                              title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                              • If None, no title is shown.
                                                                                                                                                                                                                                                                                                              • If str, text for the title.
                                                                                                                                                                                                                                                                                                              • If dict, title configuration.

                                                                                                                                                                                                                                                                                                              legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                              • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                              • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                              • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                              figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                                              filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                              display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                                              Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                                              method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                                                                              Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                                              • In-training validation scores
                                                                                                                                                                                                                                                                                                              • Cached predictions.
                                                                                                                                                                                                                                                                                                              • Shap values
                                                                                                                                                                                                                                                                                                              • App instance
                                                                                                                                                                                                                                                                                                              • Dashboard instance
                                                                                                                                                                                                                                                                                                              • Calculated holdout data sets

                                                                                                                                                                                                                                                                                                              method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                                                                              Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                                                                              method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                                                                              ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                                                                              By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                                                                              Note

                                                                                                                                                                                                                                                                                                              Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                                                                              filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                                                                              **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                                                                              method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                                                                              This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                                                                              Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                                                                              method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                                                                              Tip

                                                                                                                                                                                                                                                                                                              Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                                                                              Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                                                                              rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                                              threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                                              • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                                              • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                                              • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                                              For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                                              Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                                                                              method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                                                                              The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                                                                              ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                                              method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                                                                              The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                                                                              ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                                                                              y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                                                                              method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                                                                              In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                                                                              Warning

                                                                                                                                                                                                                                                                                                              Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                                                                              Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                                                                              method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                                                                              Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                                                                              Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                                                                              method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                                                                              Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                                                                              Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                                                                              reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                                                                              method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                              Transformers that are only applied on the training set are skipped. The rest should all implement a inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                                                                              ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                              • If None: y is ignored.
                                                                                                                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                              Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                              series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                              method predict(fh, X=None, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                                                              Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                                                                                                                              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                              Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                                                                                                                                                                                                                                                                              method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]Get prediction intervals on new data or existing rows.

                                                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_interval method.

                                                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                                                              Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                                                                                                                              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                                                                                                                              coverage: float or sequence, default=0.9 Nominal coverage(s) of predictive interval(s).

                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                              Returnsdataframe Predictions with shape=(n_samples, 2) or shape=(n_samples, 2 * n_targets) for multivariate tasks.

                                                                                                                                                                                                                                                                                                              method predict_proba(fh, X=None, marginal=True, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

                                                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                                                              Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                                                                                                                              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                                                                                                                              marginal: bool, default=True Whether returned distribution is marginal by time index.

                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                              Returnssktime.proba.Normal Predicted distribution.

                                                                                                                                                                                                                                                                                                              method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

                                                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_quantiles method.

                                                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                                                              Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                                                                                                                              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                                                                                                                              alpha: float or list of float, default=[0.05, 0.95] A probability or list of, at which quantile forecasts are computed.

                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                              Returnsdataframe Predictions with shape=(n_samples, len(alpha)) or shape=(n_samples, len(alpha) * n_targets) for multivariate tasks.

                                                                                                                                                                                                                                                                                                              method predict_residuals(y, X=None, verbose=None)[source]Get residuals of forecasts on new data or existing rows.

                                                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_residuals method.

                                                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                                                              Parametersy: int, str, dict, sequence or dataframe Ground truth observations.

                                                                                                                                                                                                                                                                                                              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to y.

                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                              Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                                                                                                                                                                                                                                                                              method predict_var(fh, X=None, cov=False, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

                                                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_var method.

                                                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                                                              Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                                                                                                                              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                                                                                                                              cov: bool, default=False Whether to compute covariance matrix forecast or marginal variance forecasts.

                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                              Returnsdataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                                                                                                                                                                                                                                                                              method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                                                                              This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                                                                              Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                                                                              stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                                                                              archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                                                                              method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                                              method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                                                                              Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                                              method score(y, X=None, fh=None, metric=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                                                              Info

                                                                                                                                                                                                                                                                                                              If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sktime's score method for estimators.

                                                                                                                                                                                                                                                                                                              Parametersy: int, str, dict, sequence or dataframe Ground truth observations.

                                                                                                                                                                                                                                                                                                              X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                                                                                                                              fh: int, sequence or ForecastingHorizon or None, default=None The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                                                                                                                              metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                              Returnsfloat Metric score of y with respect to a ground truth.

                                                                                                                                                                                                                                                                                                              method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                                                                              The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                                                                              Tip

                                                                                                                                                                                                                                                                                                              Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                                                                              Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                                                                              host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                                                                              port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                                                                              method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                              Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                                                                              ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                              • If None: y is ignored.
                                                                                                                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                              Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                              series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                              method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                                              Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                                              method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                              Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/ols/", "title": "OrdinaryLeastSquares", "text": "

                                                                                                                                                                                                                                                                                                              OLS needs scaling accept sparse supports acceleration

                                                                                                                                                                                                                                                                                                              Ordinary Least Squares is just linear regression without any regularization. It fits a linear model with coefficients w=(w1, ..., wp) to minimize the residual sum of squares between the observed targets in the dataset, and the targets predicted by the linear approximation.

                                                                                                                                                                                                                                                                                                              Corresponding estimators are:

                                                                                                                                                                                                                                                                                                              • LinearRegression for regression tasks.

                                                                                                                                                                                                                                                                                                              Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                              See Also

                                                                                                                                                                                                                                                                                                              ElasticNet Linear Regression with elasticnet regularization.

                                                                                                                                                                                                                                                                                                              Lasso Linear Regression with lasso regularization.

                                                                                                                                                                                                                                                                                                              Ridge Linear least squares with l2 regularization.

                                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/ols/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                              >>> from atom import ATOMRegressor\n>>> from sklearn.datasets import fetch_california_housing\n\n>>> X, y = fetch_california_housing(return_X_y=True)\n\n>>> atom = ATOMRegressor(X, y, random_state=1)\n>>> atom.run(models=\"OLS\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= >>\nModels: OLS\nMetric: r2\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.6067\nTest evaluation --> r2: 0.6028\nTime elapsed: 0.137s\n-------------------------------------------------\nTime: 0.137s\n\n\nFinal results ==================== >>\nTotal time: 0.138s\n-------------------------------------\nOrdinaryLeastSquares --> r2: 0.6028\n
                                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/ols/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/ols/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                                              Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                                                                              Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                                                                              Tip

                                                                                                                                                                                                                                                                                                              Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                                                                              mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                                                                              The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/ols/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                                              Attributesname: strName of the model.

                                                                                                                                                                                                                                                                                                              Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                                                                              This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                                                                              This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                                                                              This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                                                                              • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                                                                              • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                                                                              • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                                                                              • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                                                                              • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                                                                              • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                              • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                                                                                For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                                                                                This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                                                                                The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                                                                                All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                                                • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                                                • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                                                • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                                                • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                                                • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                                                • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                                                                                  The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/ols/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                                  The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                                                                                  bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                  method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                                                                                  Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                                                                                  Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                                                                                  reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                                                                                  method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                                                                                  Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                                                                                                                                                  • Cached predictions.
                                                                                                                                                                                                                                                                                                                  • Shap values
                                                                                                                                                                                                                                                                                                                  • App instance
                                                                                                                                                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                                                                                                                                                  method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                                                                                  Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                                                                                  method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                                                                                  ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                                                                                  By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                                                                                  Note

                                                                                                                                                                                                                                                                                                                  Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                                                                                  method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                                                                                  This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                                                                                  Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                                                                                  method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                  Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                                  Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                                                  Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                                                                                  method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                                                                                  The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                                                  method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                                                                                  The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                                                                                  y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                                                                                  method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                                                                                  In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                  Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                                                                                  Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                                                                                  method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                                                                                  Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                                                                                  Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                                                                                  method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                                                                                  Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                                                                                  Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                  reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                                                                                  method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. The rest should all implement a inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                  Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                  series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                  method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                  Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                                                  method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                  Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                                                                                                  method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                  Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                                                                                                  method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                                                                                  This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                                                                                  Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                                                                                  stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                                                                                  archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                                                  method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                                                  method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                  Info

                                                                                                                                                                                                                                                                                                                  If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                  • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                  metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                  Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                                                                                                  method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                                                                                  The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                                  Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                                                                                  Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                                                                                  host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                                                                                  port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                                                                                  method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/omp/", "title": "OrthogonalMatchingPursuit", "text": "

                                                                                                                                                                                                                                                                                                                  OMP needs scaling

                                                                                                                                                                                                                                                                                                                  Orthogonal Matching Pursuit implements the OMP algorithm for approximating the fit of a linear model with constraints imposed on the number of non-zero coefficients.

                                                                                                                                                                                                                                                                                                                  Corresponding estimators are:

                                                                                                                                                                                                                                                                                                                  • OrthogonalMatchingPursuit for regression tasks.

                                                                                                                                                                                                                                                                                                                  Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                  Lasso Linear Regression with lasso regularization.

                                                                                                                                                                                                                                                                                                                  LeastAngleRegression Least Angle Regression.

                                                                                                                                                                                                                                                                                                                  OrdinaryLeastSquares Linear Regression.

                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/omp/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMRegressor\n>>> from sklearn.datasets import fetch_california_housing\n\n>>> X, y = fetch_california_housing(return_X_y=True)\n\n>>> atom = ATOMRegressor(X, y, random_state=1)\n>>> atom.run(models=\"OMP\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= >>\nModels: OMP\nMetric: r2\n\n\nResults for OrthogonalMatchingPursuit:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.4751\nTest evaluation --> r2: 0.4668\nTime elapsed: 0.135s\n-------------------------------------------------\nTime: 0.135s\n\n\nFinal results ==================== >>\nTotal time: 0.136s\n-------------------------------------\nOrthogonalMatchingPursuit --> r2: 0.4668\n
                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/omp/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/omp/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                                                  Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                                                                                  Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                                  Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                                                                                  mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                                                                                  The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/omp/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                                                  Attributesname: strName of the model.

                                                                                                                                                                                                                                                                                                                  Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                                                                                  This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                  This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                                                                                  This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                                                                                  • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                                                                                  • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                                                                                  • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                                                                                  • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                                                                                  • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                  • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                                                                                    For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                                                                                    This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                                                                                    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                                                                                    All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                                                    • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                    • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                    • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                                                    • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                                                    • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                                                    • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                                                    • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                                                    • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                                                                                      The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/omp/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                                      The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                                                                                      bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                      method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                                                                                      Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                                                                                      Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                                                                                      reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                                                                                      method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                                                                                      Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                                                                                      method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                                                      This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                                                      Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                                                      cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                                                      horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                      vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                      title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                      • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                      • If str, text for the title.
                                                                                                                                                                                                                                                                                                                      • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                      legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                      • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                      • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                      • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                      figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                                                      filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                      display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                                                      Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                                                      method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                                                                                      Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                                                      • In-training validation scores
                                                                                                                                                                                                                                                                                                                      • Cached predictions.
                                                                                                                                                                                                                                                                                                                      • Shap values
                                                                                                                                                                                                                                                                                                                      • App instance
                                                                                                                                                                                                                                                                                                                      • Dashboard instance
                                                                                                                                                                                                                                                                                                                      • Calculated holdout data sets

                                                                                                                                                                                                                                                                                                                      method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                                                                                      Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                                                                                      method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                                                                                      ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                                                                                      By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                                                                                      Note

                                                                                                                                                                                                                                                                                                                      Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                                                                                      filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                                                                                      **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                                                                                      method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                                                                                      This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                                                                                      Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                                                                                      method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                      Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                                                                                                      method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                                                                                      Tip

                                                                                                                                                                                                                                                                                                                      Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                                                                                      Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                                                                                      rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                                                      threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                                                      • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                                                      • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                                                      • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                                                      For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                                                      Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                                                                                      method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                                                                                      The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                                                                                      ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                                                      method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                                                                                      The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                                                                                      ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                                                                                      y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                                                                                      method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                                                                                      In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                                                                                      Warning

                                                                                                                                                                                                                                                                                                                      Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                                                                                      Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                                                                                      method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                                                                                      Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                                                                                      Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                                                                                      method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                                                                                      Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                                                                                      Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                      reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                                                                                      method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                                      Transformers that are only applied on the training set are skipped. The rest should all implement a inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                                                                                      ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                      • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                      Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                      series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                      method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                      Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                                                      method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                      Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                                                                                                      method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                      Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                                                                                                      method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                                                                                      This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                                                                                      Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                                                                                      stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                                                                                      archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                                                                                      method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                                                      method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                                                                                      Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                                                      method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                                                                      Info

                                                                                                                                                                                                                                                                                                                      If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                      • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                      metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                      Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                                                                                                      method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                                                                                      The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                                                                                      Tip

                                                                                                                                                                                                                                                                                                                      Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                                                                                      Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                                                                                      host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                                                                                      port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                                                                                      method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                                      Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                                                                                      ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                      • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                      Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                      series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                      method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                                                      Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                                                      method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                      Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/pa/", "title": "PassiveAggressive", "text": "

                                                                                                                                                                                                                                                                                                                      PA needs scaling accept sparse allows validation

                                                                                                                                                                                                                                                                                                                      The passive-aggressive algorithms are a family of algorithms for large-scale learning. They are similar to the Perceptron in that they do not require a learning rate. However, contrary to the Perceptron, they include a regularization parameter C.

                                                                                                                                                                                                                                                                                                                      Corresponding estimators are:

                                                                                                                                                                                                                                                                                                                      • PassiveAggressiveClassifier for classification tasks.
                                                                                                                                                                                                                                                                                                                      • PassiveAggressiveRegressor for classification tasks.

                                                                                                                                                                                                                                                                                                                      Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                                      See Also

                                                                                                                                                                                                                                                                                                                      MultiLayerPerceptron Multi-layer Perceptron.

                                                                                                                                                                                                                                                                                                                      Perceptron Linear Perceptron classification.

                                                                                                                                                                                                                                                                                                                      StochasticGradientDescent Stochastic Gradient Descent.

                                                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/pa/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                      >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"PA\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: PA\nMetric: f1\n\n\nResults for PassiveAggressive:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9965\nTest evaluation --> f1: 0.9504\nTime elapsed: 5.512s\n-------------------------------------------------\nTime: 5.512s\n\n\nFinal results ==================== >>\nTotal time: 5.515s\n-------------------------------------\nPassiveAggressive --> f1: 0.9504\n
                                                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/pa/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                                                                                                                                                                                                                                                                                                                      ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)max_iterIntDistribution(high=1500, log=False, low=500, step=50)lossCategoricalDistribution(choices=('hinge', 'squared_hinge'))averageCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                                                                                                                                                      ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)max_iterIntDistribution(high=1500, log=False, low=500, step=50)lossCategoricalDistribution(choices=('epsilon_insensitive', 'squared_epsilon_insensitive'))averageCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/pa/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/pa/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                                                      Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                                                                                      Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                                                                                      Tip

                                                                                                                                                                                                                                                                                                                      Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                                                                                      mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                                                                                      The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/pa/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                                                      Attributesname: strName of the model.

                                                                                                                                                                                                                                                                                                                      Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                                                                                      This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                      This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                                                                                      This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                                                                                      • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                                                                                      • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                                                                                      • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                                                                                      • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                                                                                      • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                                                                                      • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                      • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                                                                                        For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                                                                                        This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training.

                                                                                                                                                                                                                                                                                                                        Only the scores of the main metric are tracked. Included keys are: train and test. This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                                                                                        The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                                                                                        All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                                                        • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                        • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                        • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                                                        • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                                                        • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                                                        • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                                                        • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                                                        • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                                                                                          The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/pa/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                                          The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                                                                                          bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                          method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                                                                                          Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                                                                                          Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                                                                                          reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                                                                                          method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                                                                                          Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                                                                                          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                                                          This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                                                          Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                                                          cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                                                          horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                          vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                          title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                          • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                          • If str, text for the title.
                                                                                                                                                                                                                                                                                                                          • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                          legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                          • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                          • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                          • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                          figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                                                          filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                          display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                                                          Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                                                          method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                                                                                          Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                                                          • In-training validation scores
                                                                                                                                                                                                                                                                                                                          • Cached predictions.
                                                                                                                                                                                                                                                                                                                          • Shap values
                                                                                                                                                                                                                                                                                                                          • App instance
                                                                                                                                                                                                                                                                                                                          • Dashboard instance
                                                                                                                                                                                                                                                                                                                          • Calculated holdout data sets

                                                                                                                                                                                                                                                                                                                          method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                                                                                          Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                                                                                          method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                                                                                          ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                                                                                          By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                                                                                          Note

                                                                                                                                                                                                                                                                                                                          Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                                                                                          filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                                                                                          **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                                                                                          method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                                                                                          This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                                                                                          Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                                                                                          method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                          Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                                                                                                          method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                                                                                          Tip

                                                                                                                                                                                                                                                                                                                          Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                                                                                          Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                                                                                          rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                                                          threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                                                          • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                                                          • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                                                          • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                                                          For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                                                          Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                                                                                          method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                                                                                          The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                                                                                          ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                                                          method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                                                                                          The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                                                                                          ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                                                                                          y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                                                                                          method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                                                                                          In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                                                                                          Warning

                                                                                                                                                                                                                                                                                                                          Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                                                                                          Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                                                                                          method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                                                                                          Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                                                                                          Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                                                                                          method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                                                                                          Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                                                                                          Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                          reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                                                                                          method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                                          Transformers that are only applied on the training set are skipped. The rest should all implement a inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                                                                                          ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                          • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                          Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                          series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                          method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                          Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                                                          method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                          Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                                                                                                          method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                          Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                                                                                                          method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                                                                                          This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                                                                                          Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                                                                                          stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                                                                                          archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                                                                                          method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                                                          method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                                                                                          Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                                                          method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                                                                          Info

                                                                                                                                                                                                                                                                                                                          If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                          • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                          metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                          Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                                                                                                          method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                                                                                          The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                                                                                          Tip

                                                                                                                                                                                                                                                                                                                          Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                                                                                          Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                                                                                          host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                                                                                          port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                                                                                          method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                                          Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                                                                                          ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                          • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                          Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                          series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                          method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                                                          Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                                                          method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                          Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/perc/", "title": "Perceptron", "text": "

                                                                                                                                                                                                                                                                                                                          Perc needs scaling allows validation

                                                                                                                                                                                                                                                                                                                          The Perceptron is a simple classification algorithm suitable for large scale learning. By default:

                                                                                                                                                                                                                                                                                                                          • It does not require a learning rate.
                                                                                                                                                                                                                                                                                                                          • It is not regularized (penalized).
                                                                                                                                                                                                                                                                                                                          • It updates its model only on mistakes.

                                                                                                                                                                                                                                                                                                                          The last characteristic implies that the Perceptron is slightly faster to train than StochasticGradientDescent with the hinge loss and that the resulting models are sparser.

                                                                                                                                                                                                                                                                                                                          Corresponding estimators are:

                                                                                                                                                                                                                                                                                                                          • Perceptron for classification tasks.

                                                                                                                                                                                                                                                                                                                          Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                                          See Also

                                                                                                                                                                                                                                                                                                                          MultiLayerPerceptron Multi-layer Perceptron.

                                                                                                                                                                                                                                                                                                                          PassiveAggressive Passive Aggressive.

                                                                                                                                                                                                                                                                                                                          StochasticGradientDescent Stochastic Gradient Descent.

                                                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/perc/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                          >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"Perc\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: Perc\nMetric: f1\n\n\nResults for Perceptron:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9577\nTime elapsed: 5.509s\n-------------------------------------------------\nTime: 5.509s\n\n\nFinal results ==================== >>\nTotal time: 5.512s\n-------------------------------------\nPerceptron --> f1: 0.9577\n
                                                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/perc/#hyperparameters", "title": "Hyperparameters", "text": "

                                                                                                                                                                                                                                                                                                                          ParameterspenaltyCategoricalDistribution(choices=(None, 'l2', 'l1', 'elasticnet'))alphaFloatDistribution(high=10.0, log=True, low=0.0001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)max_iterIntDistribution(high=1500, log=False, low=500, step=50)eta0FloatDistribution(high=10.0, log=True, low=0.01, step=None)

                                                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/perc/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/perc/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                                                          Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                                                                                          Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                                                                                          Tip

                                                                                                                                                                                                                                                                                                                          Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                                                                                          mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                                                                                          The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/perc/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                                                          Attributesname: strName of the model.

                                                                                                                                                                                                                                                                                                                          Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                                                                                          This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                          This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                                                                                          This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                                                                                          • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                                                                                          • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                                                                                          • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                                                                                          • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                                                                                          • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                                                                                          • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                          • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                                                                                            For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                                                                                            This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training.

                                                                                                                                                                                                                                                                                                                            Only the scores of the main metric are tracked. Included keys are: train and test. This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                                                                                            The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                                                                                            All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                                                            • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                            • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                            • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                                                            • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                                                            • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                                                            • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                                                            • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                                                            • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                                                                                              The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/perc/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                                              The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                                                                                              bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                              method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                                                                                              Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                                                                                              Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                                                                                              reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                                                                                              method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                                                                                              Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                                                                                              method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                                                              This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                                                              Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                                                              cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                                                              horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                              vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                              title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                              • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                              • If str, text for the title.
                                                                                                                                                                                                                                                                                                                              • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                              legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                              • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                              • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                              • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                              figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                                                              filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                              display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                                                              Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                                                              method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                                                                                              Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                                                              • In-training validation scores
                                                                                                                                                                                                                                                                                                                              • Cached predictions.
                                                                                                                                                                                                                                                                                                                              • Shap values
                                                                                                                                                                                                                                                                                                                              • App instance
                                                                                                                                                                                                                                                                                                                              • Dashboard instance
                                                                                                                                                                                                                                                                                                                              • Calculated holdout data sets

                                                                                                                                                                                                                                                                                                                              method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                                                                                              Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                                                                                              method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                                                                                              ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                                                                                              By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                                                                                              Note

                                                                                                                                                                                                                                                                                                                              Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                                                                                              filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                                                                                              **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                                                                                              method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                                                                                              This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                                                                                              Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                                                                                              method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                              Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                                                                                                              method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                                                                                              Tip

                                                                                                                                                                                                                                                                                                                              Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                                                                                              Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                                                                                              rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                                                              threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                                                              • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                                                              • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                                                              • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                                                              For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                                                              Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                                                                                              method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                                                                                              The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                                                                                              ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                                                              method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                                                                                              The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                                                                                              ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                                                                                              y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                                                                                              method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                                                                                              In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                                                                                              Warning

                                                                                                                                                                                                                                                                                                                              Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                                                                                              Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                                                                                              method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                                                                                              Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                                                                                              Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                                                                                              method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                                                                                              Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                                                                                              Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                              reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                                                                                              method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                                              Transformers that are only applied on the training set are skipped. The rest should all implement a inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                                                                                              ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                              • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                              Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                              series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                              method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                              Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                                                              method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                              Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                                                                                                              method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                              Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                                                                                                              method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                                                                                              This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                                                                                              Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                                                                                              stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                                                                                              archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                                                                                              method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                                                              method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                                                                                              Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                                                              method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                                                                              Info

                                                                                                                                                                                                                                                                                                                              If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                              • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                              metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                              Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                                                                                                              method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                                                                                              The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                                                                                              Tip

                                                                                                                                                                                                                                                                                                                              Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                                                                                              Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                                                                                              host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                                                                                              port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                                                                                              method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                                              Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                                                                                              ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                              • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                              Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                              series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                              method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                                                              Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                                                              method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                              Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/pt/", "title": "PolynomialTrend", "text": "

                                                                                                                                                                                                                                                                                                                              PT native multioutput

                                                                                                                                                                                                                                                                                                                              Forecast time series data with a polynomial trend, using a sklearn LinearRegression class to regress values of time series on index, after extraction of polynomial features.

                                                                                                                                                                                                                                                                                                                              Corresponding estimators are:

                                                                                                                                                                                                                                                                                                                              • PolynomialTrendForecaster for forecasting tasks.

                                                                                                                                                                                                                                                                                                                              See Also

                                                                                                                                                                                                                                                                                                                              ARIMA Autoregressive Integrated Moving Average Model.

                                                                                                                                                                                                                                                                                                                              ETS ETS model with automatic fitting capabilities.

                                                                                                                                                                                                                                                                                                                              NaiveForecaster Naive Forecaster.

                                                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/pt/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                              >>> from atom import ATOMForecaster\n>>> from sktime.datasets import load_airline\n\n>>> y = load_airline()\n\n>>> atom = ATOMForecaster(y, random_state=1)\n>>> atom.run(models=\"PT\", verbose=2)\n\n\nTraining ========================= >>\nModels: PT\nMetric: mape\n\n\nResults for PolynomialTrend:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.1196\nTest evaluation --> mape: -0.1181\nTime elapsed: 0.018s\n-------------------------------------------------\nTime: 0.018s\n\n\nFinal results ==================== >>\nTotal time: 0.019s\n-------------------------------------\nPolynomialTrend --> mape: -0.1181\n
                                                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/pt/#hyperparameters", "title": "Hyperparameters", "text": "

                                                                                                                                                                                                                                                                                                                              ParametersdegreeIntDistribution(high=5, log=False, low=1, step=1)with_interceptCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/pt/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/pt/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                                                              Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                                                                                              Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                                                                                              Tip

                                                                                                                                                                                                                                                                                                                              Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                                                                                              mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                                                                                              The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/pt/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                                                              Attributesname: strName of the model.

                                                                                                                                                                                                                                                                                                                              Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                                                                                              This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                              This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                                                                                              This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                                                                                              • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                                                                                              • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                                                                                              • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                                                                                              • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                                                                                              • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                                                                                              • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                              • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                                                                                                For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                                                                                                This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                                                                                                The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                                                                                                All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                                                                • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                                                                • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                                                                • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                                                                • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                                                                • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                                                                • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                                                                                                  The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/pt/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                                                  The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                                                                                                  bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                                  method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                                                                                                  Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                                                                                                  Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                                                                                                  reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                                                                                                  method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                                                                                                  Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                                                                                                                                                                  • Cached predictions.
                                                                                                                                                                                                                                                                                                                                  • Shap values
                                                                                                                                                                                                                                                                                                                                  • App instance
                                                                                                                                                                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                                                                                                                                                                  method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                                                                                                  Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                                                                                                  method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                                                                                                  ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                                                                                                  By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                  Note

                                                                                                                                                                                                                                                                                                                                  Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                                                                                                  method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                                                                                                  This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                                                                                                  Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                                                  Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                                                                  Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                                                                                                  method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                                                                                                  The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                                                                  method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                                                                                                  The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                                                                                                  y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                                                                                                  method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                                                                                                  In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                  Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                                                                                                  Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                                                                                                  method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                                                                                                  Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                                                                                                  Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                                                                                                  method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                                                                                                  Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                                                                                                  Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                                  reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                                                                                                  method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. The rest should all implement a inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                  series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                  method predict(fh, X=None, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                  Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                                                                                                                                                                                                                                                                                                  method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]Get prediction intervals on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_interval method.

                                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                                                                                                                                                  coverage: float or sequence, default=0.9 Nominal coverage(s) of predictive interval(s).

                                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Predictions with shape=(n_samples, 2) or shape=(n_samples, 2 * n_targets) for multivariate tasks.

                                                                                                                                                                                                                                                                                                                                  method predict_proba(fh, X=None, marginal=True, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                                                                                                                                                  marginal: bool, default=True Whether returned distribution is marginal by time index.

                                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                  Returnssktime.proba.Normal Predicted distribution.

                                                                                                                                                                                                                                                                                                                                  method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_quantiles method.

                                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                                                                                                                                                  alpha: float or list of float, default=[0.05, 0.95] A probability or list of, at which quantile forecasts are computed.

                                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Predictions with shape=(n_samples, len(alpha)) or shape=(n_samples, len(alpha) * n_targets) for multivariate tasks.

                                                                                                                                                                                                                                                                                                                                  method predict_residuals(y, X=None, verbose=None)[source]Get residuals of forecasts on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_residuals method.

                                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                  Parametersy: int, str, dict, sequence or dataframe Ground truth observations.

                                                                                                                                                                                                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to y.

                                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                  Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                                                                                                                                                                                                                                                                                                  method predict_var(fh, X=None, cov=False, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_var method.

                                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                  Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                                                                                                                                                  cov: bool, default=False Whether to compute covariance matrix forecast or marginal variance forecasts.

                                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks.

                                                                                                                                                                                                                                                                                                                                  method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                                                                                                  This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                                                                                                  Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                                                                                                  stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                                                                                                  archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                                                                  method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                                                                  method score(y, X=None, fh=None, metric=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                  Info

                                                                                                                                                                                                                                                                                                                                  If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sktime's score method for estimators.

                                                                                                                                                                                                                                                                                                                                  Parametersy: int, str, dict, sequence or dataframe Ground truth observations.

                                                                                                                                                                                                                                                                                                                                  X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to fh.

                                                                                                                                                                                                                                                                                                                                  fh: int, sequence or ForecastingHorizon or None, default=None The forecasting horizon encoding the time stamps to forecast at.

                                                                                                                                                                                                                                                                                                                                  metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                  Returnsfloat Metric score of y with respect to a ground truth.

                                                                                                                                                                                                                                                                                                                                  method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                                                                                                  The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                                                  Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                                                                                                  Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                                                                                                  host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                                                                                                  port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                                                                                                  method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/qda/", "title": "QuadraticDiscriminantAnalysis", "text": "

                                                                                                                                                                                                                                                                                                                                  QDA

                                                                                                                                                                                                                                                                                                                                  Quadratic Discriminant Analysis is a classifier with a quadratic decision boundary, generated by fitting class conditional densities to the data and using Bayes\u2019 rule. The model fits a Gaussian density to each class, assuming that all classes share the same covariance matrix.

                                                                                                                                                                                                                                                                                                                                  Corresponding estimators are:

                                                                                                                                                                                                                                                                                                                                  • QuadraticDiscriminantAnalysis for classification tasks.

                                                                                                                                                                                                                                                                                                                                  Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                  LinearDiscriminantAnalysis Linear Discriminant Analysis.

                                                                                                                                                                                                                                                                                                                                  LogisticRegression Logistic Regression.

                                                                                                                                                                                                                                                                                                                                  RadiusNearestNeighbors Radius Nearest Neighbors.

                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/qda/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"QDA\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: QDA\nMetric: f1\n\n\nResults for QuadraticDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9809\nTest evaluation --> f1: 0.9504\nTime elapsed: 0.023s\n-------------------------------------------------\nTime: 0.023s\n\n\nFinal results ==================== >>\nTotal time: 0.026s\n-------------------------------------\nQuadraticDiscriminantAnalysis --> f1: 0.9504\n
                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/qda/#hyperparameters", "title": "Hyperparameters", "text": "

                                                                                                                                                                                                                                                                                                                                  Parametersreg_paramFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)

                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/qda/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/qda/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                                                                  Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                                                                                                  Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                                                  Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                                                                                                  mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                                                                                                  The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/qda/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                                                                  Attributesname: strName of the model.

                                                                                                                                                                                                                                                                                                                                  Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                                                                                                  This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                                  This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                                                                                                  This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                                                                                                  • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                                                                                                  • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                                                                                                  • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                                                                                                  • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                                                                                                  • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                  • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                                                                                                    For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                                                                                                    This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                                                                                                    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                                                                                                    All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                                                                    • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                    • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                    • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                                                                    • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                                                                    • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                                                                    • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                                                                    • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                                                                    • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                                                                                                      The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/qda/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                                                      The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                                                                                                      bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                                      method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                                                                                                      Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                                                                                                      Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                                                                                                      reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                                                                                                      method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                                                                                                      Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                                                                                                      method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                                                                      This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                                                                      Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                                                                      cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                                                                      horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                      vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                      title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                      • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                      • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                      • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                      legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                      • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                      • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                      • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                      figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                                                                      filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                      display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                                                                      Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                                                                      method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                                                                                                      Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                                                                      • In-training validation scores
                                                                                                                                                                                                                                                                                                                                      • Cached predictions.
                                                                                                                                                                                                                                                                                                                                      • Shap values
                                                                                                                                                                                                                                                                                                                                      • App instance
                                                                                                                                                                                                                                                                                                                                      • Dashboard instance
                                                                                                                                                                                                                                                                                                                                      • Calculated holdout data sets

                                                                                                                                                                                                                                                                                                                                      method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                                                                                                      Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                                                                                                      method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                                                                                                      ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                                                                                                      By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                      Note

                                                                                                                                                                                                                                                                                                                                      Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                                                                                                      filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                                                                                                      **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                                                                                                      method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                                                                                                      This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                                                                                                      Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                                                                                                      method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                      Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                                                                                                                      method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                                                                                                      Tip

                                                                                                                                                                                                                                                                                                                                      Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                                                                                                      Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                                                                                                      rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                                                                      threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                                                                      • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                                                                      • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                                                                      • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                                                                      For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                                                                      Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                                                                                                      method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                                                                                                      The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                                                                                                      ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                                                                      method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                                                                                                      The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                                                                                                      ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                                                                                                      y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                                                                                                      method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                                                                                                      In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                                                                                                      Warning

                                                                                                                                                                                                                                                                                                                                      Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                                                                                                      Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                                                                                                      method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                                                                                                      Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                                                                                                      Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                                                                                                      method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                                                                                                      Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                                                                                                      Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                                      reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                                                                                                      method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                                                      Transformers that are only applied on the training set are skipped. The rest should all implement a inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                                                                                                      ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                      • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                      Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                      series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                      method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                      Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                      method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                      Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                                                                                                                      method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                      Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                      method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                                                                                                      This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                                                                                                      Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                                                                                                      stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                                                                                                      archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                                                                                                      method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                                                                      method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                                                                                                      Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                                                                      method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                      Info

                                                                                                                                                                                                                                                                                                                                      If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                      • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                      metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                      Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                                                                                                                      method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                                                                                                      The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                                                                                                      Tip

                                                                                                                                                                                                                                                                                                                                      Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                                                                                                      Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                                                                                                      host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                                                                                                      port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                                                                                                      method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                                                      Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                                                                                                      ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                      • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                      Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                      series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                      method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                                                                      Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                                                                      method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                                      Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/rf/", "title": "RandomForest", "text": "

                                                                                                                                                                                                                                                                                                                                      RF accept sparse native multilabel native multioutput supports acceleration

                                                                                                                                                                                                                                                                                                                                      Random forests are an ensemble learning method that operate by constructing a multitude of decision trees at training time and outputting the class that is the mode of the classes (classification) or mean prediction (regression) of the individual trees. Random forests correct for decision trees' habit of overfitting to their training set.

                                                                                                                                                                                                                                                                                                                                      Corresponding estimators are:

                                                                                                                                                                                                                                                                                                                                      • RandomForestClassifier for classification tasks.
                                                                                                                                                                                                                                                                                                                                      • RandomForestRegressor for regression tasks.

                                                                                                                                                                                                                                                                                                                                      Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                                                      Warning

                                                                                                                                                                                                                                                                                                                                      cuML's implementation of RandomForestClassifier only supports predictions on dtype float32. Convert all dtypes before calling atom's run method to avoid exceptions.

                                                                                                                                                                                                                                                                                                                                      See Also

                                                                                                                                                                                                                                                                                                                                      DecisionTree Single Decision Tree.

                                                                                                                                                                                                                                                                                                                                      ExtraTrees Extremely Randomized Trees.

                                                                                                                                                                                                                                                                                                                                      HistGradientBoosting Histogram-based Gradient Boosting Machine.

                                                                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/rf/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                      >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"RF\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: RF\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9524\nTime elapsed: 0.232s\n-------------------------------------------------\nTime: 0.232s\n\n\nFinal results ==================== >>\nTotal time: 0.236s\n-------------------------------------\nRandomForest --> f1: 0.9524\n
                                                                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/rf/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression sklearnsklearnexcuml

                                                                                                                                                                                                                                                                                                                                      Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('gini', 'entropy'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                                                                                                                                                                                                                                                                                                      cpugpu

                                                                                                                                                                                                                                                                                                                                      Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('gini', 'entropy'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                                                                                                                                                                                                                                                                                                      Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('gini', 'entropy'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                                                                                                                                                                                                                                                                                                      Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('gini', 'entropy'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                                                                                                                                                                                                                                                                                                      sklearnsklearnexcuml

                                                                                                                                                                                                                                                                                                                                      Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('squared_error', 'absolute_error', 'poisson'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                                                                                                                                                                                                                                                                                                      cpugpu

                                                                                                                                                                                                                                                                                                                                      Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('squared_error', 'absolute_error', 'poisson'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                                                                                                                                                                                                                                                                                                      Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('squared_error', 'absolute_error', 'poisson'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                                                                                                                                                                                                                                                                                                      Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('squared_error', 'absolute_error', 'poisson'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/rf/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/rf/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                                                                      Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                                                                                                      Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                                                                                                      Tip

                                                                                                                                                                                                                                                                                                                                      Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                                                                                                      mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                                                                                                      The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/rf/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                                                                      Attributesname: strName of the model.

                                                                                                                                                                                                                                                                                                                                      Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                                                                                                      This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                                      This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                                                                                                      This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                                                                                                      • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                                                                                                      • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                                                                                                      • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                                                                                                      • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                                                                                                      • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                                                                                                      • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                      • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                                                                                                        For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                                                                                                        This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                                                                                                        The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                                                                                                        All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                                                                        • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                        • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                        • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                                                                        • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                                                                        • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                                                                        • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                                                                        • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                                                                        • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                                                                                                          The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/rf/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                                                          The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                                                                                                          bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                                          method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                                                                                                          Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                                                                                                          Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                                                                                                          reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                                                                                                          method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                                                                                                          Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                                                                                                          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                                                                          This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                                                                          Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                                                                          cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                                                                          horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                          vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                          title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                          • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                          • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                          • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                          legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                          • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                          • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                          • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                          figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                                                                          filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                          display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                                                                          Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                                                                          method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                                                                                                          Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                                                                          • In-training validation scores
                                                                                                                                                                                                                                                                                                                                          • Cached predictions.
                                                                                                                                                                                                                                                                                                                                          • Shap values
                                                                                                                                                                                                                                                                                                                                          • App instance
                                                                                                                                                                                                                                                                                                                                          • Dashboard instance
                                                                                                                                                                                                                                                                                                                                          • Calculated holdout data sets

                                                                                                                                                                                                                                                                                                                                          method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                                                                                                          Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                                                                                                          method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                                                                                                          ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                                                                                                          By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                          Note

                                                                                                                                                                                                                                                                                                                                          Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                                                                                                          filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                                                                                                          **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                                                                                                          method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                                                                                                          This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                                                                                                          Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                                                                                                          method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                          Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                                                                                                                          method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                                                                                                          Tip

                                                                                                                                                                                                                                                                                                                                          Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                                                                                                          Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                                                                                                          rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                                                                          threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                                                                          • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                                                                          • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                                                                          • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                                                                          For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                                                                          Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                                                                                                          method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                                                                                                          The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                                                                                                          ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                                                                          method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                                                                                                          The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                                                                                                          ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                                                                                                          y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                                                                                                          method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                                                                                                          In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                                                                                                          Warning

                                                                                                                                                                                                                                                                                                                                          Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                                                                                                          Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                                                                                                          method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                                                                                                          Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                                                                                                          Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                                                                                                          method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                                                                                                          Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                                                                                                          Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                                          reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                                                                                                          method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                                                          Transformers that are only applied on the training set are skipped. The rest should all implement a inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                                                                                                          ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                          • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                          Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                          series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                          method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                          Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                          method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                          Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                                                                                                                          method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                          Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                          method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                                                                                                          This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                                                                                                          Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                                                                                                          stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                                                                                                          archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                                                                                                          method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                                                                          method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                                                                                                          Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                                                                          method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                          Info

                                                                                                                                                                                                                                                                                                                                          If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                          • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                          metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                          Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                                                                                                                          method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                                                                                                          The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                                                                                                          Tip

                                                                                                                                                                                                                                                                                                                                          Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                                                                                                          Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                                                                                                          host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                                                                                                          port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                                                                                                          method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                                                          Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                                                                                                          ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                          • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                          Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                          series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                          method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                                                                          Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                                                                          method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                                          Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/ridge/", "title": "Ridge", "text": "

                                                                                                                                                                                                                                                                                                                                          Ridge needs scaling accept sparse native multilabel supports acceleration

                                                                                                                                                                                                                                                                                                                                          If classifier, it first converts the target values into {-1, 1} and then treats the problem as a regression task.

                                                                                                                                                                                                                                                                                                                                          Corresponding estimators are:

                                                                                                                                                                                                                                                                                                                                          • RidgeClassifier for classification tasks.
                                                                                                                                                                                                                                                                                                                                          • Ridge for regression tasks.

                                                                                                                                                                                                                                                                                                                                          Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                                                          Warning

                                                                                                                                                                                                                                                                                                                                          Engines sklearnex and cuml are only available for regression tasks.

                                                                                                                                                                                                                                                                                                                                          See Also

                                                                                                                                                                                                                                                                                                                                          BayesianRidge Bayesian ridge regression.

                                                                                                                                                                                                                                                                                                                                          ElasticNet Linear Regression with elasticnet regularization.

                                                                                                                                                                                                                                                                                                                                          Lasso Linear Regression with lasso regularization.

                                                                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/ridge/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                          >>> from atom import ATOMRegressor\n>>> from sklearn.datasets import fetch_california_housing\n\n>>> X, y = fetch_california_housing(return_X_y=True)\n\n>>> atom = ATOMRegressor(X, y, random_state=1)\n>>> atom.run(models=\"Ridge\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= >>\nModels: Ridge\nMetric: r2\n\n\nResults for Ridge:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.6067\nTest evaluation --> r2: 0.6028\nTime elapsed: 0.136s\n-------------------------------------------------\nTime: 0.136s\n\n\nFinal results ==================== >>\nTotal time: 0.137s\n-------------------------------------\nRidge --> r2: 0.6028\n
                                                                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/ridge/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression sklearnsklearnexcuml

                                                                                                                                                                                                                                                                                                                                          ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))

                                                                                                                                                                                                                                                                                                                                          cpugpu

                                                                                                                                                                                                                                                                                                                                          ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))

                                                                                                                                                                                                                                                                                                                                          ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))

                                                                                                                                                                                                                                                                                                                                          ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))

                                                                                                                                                                                                                                                                                                                                          sklearnsklearnexcuml

                                                                                                                                                                                                                                                                                                                                          ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))

                                                                                                                                                                                                                                                                                                                                          cpugpu

                                                                                                                                                                                                                                                                                                                                          ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))

                                                                                                                                                                                                                                                                                                                                          ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))

                                                                                                                                                                                                                                                                                                                                          ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))

                                                                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/ridge/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/ridge/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                                                                          Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                                                                                                          Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                                                                                                          Tip

                                                                                                                                                                                                                                                                                                                                          Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                                                                                                          mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                                                                                                          The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/ridge/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                                                                          Attributesname: strName of the model.

                                                                                                                                                                                                                                                                                                                                          Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                                                                                                          This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                                          This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                                                                                                          This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                                                                                                          • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                                                                                                          • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                                                                                                          • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                                                                                                          • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                                                                                                          • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                                                                                                          • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                          • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                                                                                                            For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                                                                                                            This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                                                                                                            The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                                                                                                            All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                                                                            • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                            • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                            • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                                                                            • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                                                                            • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                                                                            • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                                                                            • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                                                                            • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                                                                                                              The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/ridge/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                                                              The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                                                                                                              bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                                              method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                                                                                                              Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                                                                                                              Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                                                                                                              reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                                                                                                              method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                                                                                                              Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                                                                                                              method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                                                                              This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                                                                              Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                                                                              cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                                                                              horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                              vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                              title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                              • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                              • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                              • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                              legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                              • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                              • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                              • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                              figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                                                                              filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                              display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                                                                              Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                                                                              method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                                                                                                              Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                                                                              • In-training validation scores
                                                                                                                                                                                                                                                                                                                                              • Cached predictions.
                                                                                                                                                                                                                                                                                                                                              • Shap values
                                                                                                                                                                                                                                                                                                                                              • App instance
                                                                                                                                                                                                                                                                                                                                              • Dashboard instance
                                                                                                                                                                                                                                                                                                                                              • Calculated holdout data sets

                                                                                                                                                                                                                                                                                                                                              method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                                                                                                              Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                                                                                                              method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                                                                                                              ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                                                                                                              By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                              Note

                                                                                                                                                                                                                                                                                                                                              Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                                                                                                              filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                                                                                                              **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                                                                                                              method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                                                                                                              This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                                                                                                              Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                                                                                                              method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                              Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                                                                                                                              method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                                                                                                              Tip

                                                                                                                                                                                                                                                                                                                                              Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                                                                                                              Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                                                                                                              rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                                                                              threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                                                                              • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                                                                              • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                                                                              • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                                                                              For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                                                                              Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                                                                                                              method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                                                                                                              The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                                                                                                              ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                                                                              method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                                                                                                              The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                                                                                                              ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                                                                                                              y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                                                                                                              method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                                                                                                              In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                                                                                                              Warning

                                                                                                                                                                                                                                                                                                                                              Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                                                                                                              Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                                                                                                              method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                                                                                                              Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                                                                                                              Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                                                                                                              method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                                                                                                              Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                                                                                                              Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                                              reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                                                                                                              method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                                                              Transformers that are only applied on the training set are skipped. The rest should all implement a inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                                                                                                              ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                              • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                              Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                              series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                              method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                              Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                              method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                              Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                                                                                                                              method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                              Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                              method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                                                                                                              This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                                                                                                              Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                                                                                                              stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                                                                                                              archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                                                                                                              method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                                                                              method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                                                                                                              Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                                                                              method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                              Info

                                                                                                                                                                                                                                                                                                                                              If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                              • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                              metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                              Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                                                                                                                              method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                                                                                                              The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                                                                                                              Tip

                                                                                                                                                                                                                                                                                                                                              Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                                                                                                              Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                                                                                                              host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                                                                                                              port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                                                                                                              method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                                                              Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                                                                                                              ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                              • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                              Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                              series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                              method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                                                                              Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                                                                              method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                                              Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/rnn/", "title": "RadiusNearestNeighbors", "text": "

                                                                                                                                                                                                                                                                                                                                              RNN needs scaling accept sparse native multilabel native multioutput

                                                                                                                                                                                                                                                                                                                                              Radius Nearest Neighbors implements the nearest neighbors vote, where the neighbors are selected from within a given radius. For regression, the target is predicted by local interpolation of the targets associated of the nearest neighbors in the training set.

                                                                                                                                                                                                                                                                                                                                              Warning

                                                                                                                                                                                                                                                                                                                                              • The radius parameter should be tuned to the data at hand or the model will perform poorly.
                                                                                                                                                                                                                                                                                                                                              • If outliers are detected, the estimator raises an exception unless est_params={\"outlier_label\": \"most_frequent\"} is used.

                                                                                                                                                                                                                                                                                                                                              Corresponding estimators are:

                                                                                                                                                                                                                                                                                                                                              • RadiusNeighborsClassifier for classification tasks.
                                                                                                                                                                                                                                                                                                                                              • RadiusNeighborsRegressor for regression tasks.

                                                                                                                                                                                                                                                                                                                                              Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                                                              See Also

                                                                                                                                                                                                                                                                                                                                              KNearestNeighbors K-Nearest Neighbors.

                                                                                                                                                                                                                                                                                                                                              LinearDiscriminantAnalysis Linear Discriminant Analysis.

                                                                                                                                                                                                                                                                                                                                              QuadraticDiscriminantAnalysis Quadratic Discriminant Analysis.

                                                                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/rnn/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                              >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\n...     models=\"RNN\",\n...     metric=\"f1\",\n...     est_params={\"outlier_label\": \"most_frequent\"},\n...     verbose=2,\n... )\n\n\nTraining ========================= >>\nModels: RNN\nMetric: f1\n\n\nResults for RadiusNearestNeighbors:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.7717\nTime elapsed: 0.091s\n-------------------------------------------------\nTime: 0.091s\n\n\nFinal results ==================== >>\nTotal time: 0.094s\n-------------------------------------\nRadiusNearestNeighbors --> f1: 0.7717 ~\n
                                                                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/rnn/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                                                                                                                                                                                                                                                                                                                                              ParametersradiusFloatDistribution(high=100.0, log=False, low=0.01, step=None)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)

                                                                                                                                                                                                                                                                                                                                              ParametersradiusFloatDistribution(high=100.0, log=False, low=0.01, step=None)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)

                                                                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/rnn/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/rnn/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                                                                              Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                                                                                                              Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                                                                                                              Tip

                                                                                                                                                                                                                                                                                                                                              Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                                                                                                              mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                                                                                                              The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/rnn/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                                                                              Attributesname: strName of the model.

                                                                                                                                                                                                                                                                                                                                              Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                                                                                                              This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                                              This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                                                                                                              This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                                                                                                              • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                                                                                                              • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                                                                                                              • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                                                                                                              • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                                                                                                              • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                                                                                                              • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                              • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                                                                                                                For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                                                                                                                This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                                                                                                                The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                                                                                                                All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                                                                                • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                                                                                • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                                                                                • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                                                                                • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                                                                                • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                                                                                • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                                                                                                                  The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/rnn/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                                                                  The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                                                                                                                  bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                                                  method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                                                                                                                  Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                                                                                                                  Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                                                                                                                  reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                                                                                                                  method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                                                                                                                  Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                                                                                                                                                                                  • Cached predictions.
                                                                                                                                                                                                                                                                                                                                                  • Shap values
                                                                                                                                                                                                                                                                                                                                                  • App instance
                                                                                                                                                                                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                                                                                                                                                                                  method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                                                                                                                  Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                                                                                                                  method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                                                                                                                  ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                                                                                                                  By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                  Note

                                                                                                                                                                                                                                                                                                                                                  Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                                                                                                                  method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                                                                                                                  This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                                                                                                                  Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                                                                                                                  method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                  Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                                                                  Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                                                                                  Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                                                                                                                  method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                                                                                                                  The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                                                                                  method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                                                                                                                  The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                                                                                                                  y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                                                                                                                  method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                                                                                                                  In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                  Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                                                                                                                  Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                                                                                                                  method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                                                                                                                  Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                                                                                                                  Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                                                                                                                  method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                                                                                                                  Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                                                                                                                  Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                                                  reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                                                                                                                  method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. The rest should all implement a inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                  series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                  method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                  Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                  method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                                                                                                                                  method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                  method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                                                                                                                  This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                                                                                                                  Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                                                                                                                  stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                                                                                                                  archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                                                                                  method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                                                                                  method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                  Info

                                                                                                                                                                                                                                                                                                                                                  If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                  • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                  metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                  Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                                                                                                                                  method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                                                                                                                  The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                                                                  Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                                                                                                                  Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                                                                                                                  host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                                                                                                                  port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                                                                                                                  method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/sgd/", "title": "StochasticGradientDescent", "text": "

                                                                                                                                                                                                                                                                                                                                                  SGD needs scaling accept sparse allows validation

                                                                                                                                                                                                                                                                                                                                                  Stochastic Gradient Descent is a simple yet very efficient approach to fitting linear classifiers and regressors under convex loss functions. Even though SGD has been around in the machine learning community for a long time, it has received a considerable amount of attention just recently in the context of large-scale learning.

                                                                                                                                                                                                                                                                                                                                                  Corresponding estimators are:

                                                                                                                                                                                                                                                                                                                                                  • SGDClassifier for classification tasks.
                                                                                                                                                                                                                                                                                                                                                  • SGDRegressor for regression tasks.

                                                                                                                                                                                                                                                                                                                                                  Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                  MultiLayerPerceptron Multi-layer Perceptron.

                                                                                                                                                                                                                                                                                                                                                  PassiveAggressive Passive Aggressive.

                                                                                                                                                                                                                                                                                                                                                  SupportVectorMachine Support Vector Machine.

                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/sgd/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"SGD\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: SGD\nMetric: f1\n\n\nResults for StochasticGradientDescent:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9948\nTest evaluation --> f1: 0.9722\nTime elapsed: 5.506s\n-------------------------------------------------\nTime: 5.506s\n\n\nFinal results ==================== >>\nTotal time: 5.509s\n-------------------------------------\nStochasticGradientDescent --> f1: 0.9722\n
                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/sgd/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                                                                                                                                                                                                                                                                                                                                                  ParameterslossCategoricalDistribution(choices=('hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'))penaltyCategoricalDistribution(choices=(None, 'l1', 'l2', 'elasticnet'))alphaFloatDistribution(high=1.0, log=True, low=0.0001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)max_iterIntDistribution(high=1500, log=False, low=500, step=50)epsilonFloatDistribution(high=1.0, log=True, low=0.0001, step=None)learning_rateCategoricalDistribution(choices=('constant', 'invscaling', 'optimal', 'adaptive'))eta0FloatDistribution(high=10.0, log=True, low=0.01, step=None)power_tFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)averageCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                                                                                                                                                                                  ParameterslossCategoricalDistribution(choices=('squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'))penaltyCategoricalDistribution(choices=(None, 'l1', 'l2', 'elasticnet'))alphaFloatDistribution(high=1.0, log=True, low=0.0001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)max_iterIntDistribution(high=1500, log=False, low=500, step=50)epsilonFloatDistribution(high=1.0, log=True, low=0.0001, step=None)learning_rateCategoricalDistribution(choices=('constant', 'invscaling', 'optimal', 'adaptive'))eta0FloatDistribution(high=10.0, log=True, low=0.01, step=None)power_tFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)averageCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/sgd/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/sgd/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                  Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                                                                                                                  Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                                                                  Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                                                                                                                  mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                                                                                                                  The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/sgd/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                  Attributesname: strName of the model.

                                                                                                                                                                                                                                                                                                                                                  Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                                                                                                                  This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                                                  This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                                                                                                                  This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                                                                                                                  • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                                                                                                                  • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                                                                                                                  • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                                                                                                                  • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                                                                                                                  • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                  • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                                                                                                                    For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                                                                                                                    This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training.

                                                                                                                                                                                                                                                                                                                                                    Only the scores of the main metric are tracked. Included keys are: train and test. This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                                                                                                                    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                                                                                                                    All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                                                                                    • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                    • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                    • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                                                                                    • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                                                                                    • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                                                                                    • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                                                                                    • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                                                                                    • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                                                                                                                      The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/sgd/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                                                                      The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                                                                                                                      bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                                                      method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                                                                                                                      Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                                                                                                                      Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                                                                                                                      reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                                                                                                                      method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                                                                                                                      Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                                                                                                                      method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                                                                                      This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                                                                                      Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                                                                                      cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                                                                                      horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                      vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                      title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                      • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                      • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                      • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                      legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                      • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                      • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                      • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                      figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                                                                                      filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                      display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                                                                                      Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                                                                                      method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                                                                                                                      Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                                                                                      • In-training validation scores
                                                                                                                                                                                                                                                                                                                                                      • Cached predictions.
                                                                                                                                                                                                                                                                                                                                                      • Shap values
                                                                                                                                                                                                                                                                                                                                                      • App instance
                                                                                                                                                                                                                                                                                                                                                      • Dashboard instance
                                                                                                                                                                                                                                                                                                                                                      • Calculated holdout data sets

                                                                                                                                                                                                                                                                                                                                                      method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                                                                                                                      Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                                                                                                                      method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                                                                                                                      ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                                                                                                                      By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                      Note

                                                                                                                                                                                                                                                                                                                                                      Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                                                                                                                      filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                                                                                                                      **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                                                                                                                      method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                                                                                                                      This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                                                                                                                      Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                                                                                                                      Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                                                                                                                      method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                      Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                                                                                                                                      method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                                                                                                                      Tip

                                                                                                                                                                                                                                                                                                                                                      Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                                                                                                                      Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                                                                                                                      rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                                                                                      threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                                                                                      • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                                                                                      • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                                                                                      • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                                                                                      For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                                                                                      Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                                                                                                                      method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                                                                                                                      The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                                                                                                                      ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                                                                                      method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                                                                                                                      The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                                                                                                                      ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                                                                                                                      y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                                                                                                                      method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                                                                                                                      In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                                                                                                                      Warning

                                                                                                                                                                                                                                                                                                                                                      Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                                                                                                                      Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                                                                                                                      method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                                                                                                                      Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                                                                                                                      Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                                                                                                                      Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                                                                                                                      method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                                                                                                                      Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                                                                                                                      Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                                                      reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                                                                                                                      method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                                                                      Transformers that are only applied on the training set are skipped. The rest should all implement a inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                                                                                                                      ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                      • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                      Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                      series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                      method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                      Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                      method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                      Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                                                                                                                                      method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                      Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                      method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                                                                                                                      This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                                                                                                                      Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                                                                                                                      stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                                                                                                                      archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                                                                                                                      method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                                                                                      method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                                                                                                                      Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                                                                                      method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                                                                                                                      New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                                                                                                                      Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                      Info

                                                                                                                                                                                                                                                                                                                                                      If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                                                                                                                                      ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                      • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                      metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                                                                                                                      sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                      Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                                                                                                                                      method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                                                                                                                      The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                                                                                                                      Tip

                                                                                                                                                                                                                                                                                                                                                      Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                                                                                                                      Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                                                                                                                      host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                                                                                                                      port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                                                                                                                      method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                                                                      Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                                                                                                                      ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                                                                      y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                      • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                                      • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                      • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                      • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                      • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                      • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                      verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                      Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                      series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                      method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                                                                                      Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                                                                                      method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                                                      Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                      Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/svm/", "title": "SupportVectorMachine", "text": "

                                                                                                                                                                                                                                                                                                                                                      SVM needs scaling accept sparse supports acceleration

                                                                                                                                                                                                                                                                                                                                                      The implementation of the Support Vector Machine is based on libsvm. The fit time scales at least quadratically with the number of samples and may be impractical beyond tens of thousands of samples. For large datasets consider using a LinearSVM or a StochasticGradientDescent model instead.

                                                                                                                                                                                                                                                                                                                                                      Corresponding estimators are:

                                                                                                                                                                                                                                                                                                                                                      • SVC for classification tasks.
                                                                                                                                                                                                                                                                                                                                                      • SVR for classification tasks.

                                                                                                                                                                                                                                                                                                                                                      Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                                                                      See Also

                                                                                                                                                                                                                                                                                                                                                      LinearSVM Linear Support Vector Machine.

                                                                                                                                                                                                                                                                                                                                                      MultiLayerPerceptron Multi-layer Perceptron.

                                                                                                                                                                                                                                                                                                                                                      StochasticGradientDescent Stochastic Gradient Descent.

                                                                                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/svm/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                      >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"SVM\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: SVM\nMetric: f1\n\n\nResults for SupportVectorMachine:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9913\nTest evaluation --> f1: 0.979\nTime elapsed: 0.095s\n-------------------------------------------------\nTime: 0.095s\n\n\nFinal results ==================== >>\nTotal time: 0.098s\n-------------------------------------\nSupportVectorMachine --> f1: 0.979\n
                                                                                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/svm/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression sklearnsklearnexcuml

                                                                                                                                                                                                                                                                                                                                                      ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)shrinkingCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                                                                                                                                                                                      cpugpu

                                                                                                                                                                                                                                                                                                                                                      ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)shrinkingCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                                                                                                                                                                                      ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)shrinkingCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                                                                                                                                                                                      ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)shrinkingCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                                                                                                                                                                                      sklearnsklearnexcuml

                                                                                                                                                                                                                                                                                                                                                      ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)epsilonFloatDistribution(high=100.0, log=True, low=0.001, step=None)shrinkingCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                                                                                                                                                                                      cpugpu

                                                                                                                                                                                                                                                                                                                                                      ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)epsilonFloatDistribution(high=100.0, log=True, low=0.001, step=None)shrinkingCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                                                                                                                                                                                      ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)epsilonFloatDistribution(high=100.0, log=True, low=0.001, step=None)shrinkingCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                                                                                                                                                                                      ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)epsilonFloatDistribution(high=100.0, log=True, low=0.001, step=None)shrinkingCategoricalDistribution(choices=(True, False))

                                                                                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/svm/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/svm/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                      Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                                                                                                                      Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                                                                                                                      Tip

                                                                                                                                                                                                                                                                                                                                                      Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                                                                                                                      mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                                                                                                                      The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                                                                                      "}, {"location": "API/models/svm/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                      Attributesname: strName of the model.

                                                                                                                                                                                                                                                                                                                                                      Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                                                                                                                      This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                                                      This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                                                                                                                      This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                                                                                                                      • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                                                                                                                      • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                                                                                                                      • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                                                                                                                      • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                                                                                                                      • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                                                                                                                      • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                      • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                                                                                                                        For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                                                                                                                        This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                                                                                                                        The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                                                                                                                        All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                                                                                        • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                        • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                        • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                                                                                        • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                                                                                        • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                                                                                        • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                                                                                        • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                                                                                        • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                                                                                                                          The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/svm/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                                                                          The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                                                                                                                          bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                                                          method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                                                                                                                          Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                                                                                                                          Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                                                                                                                          reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                                                                                                                          method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                                                                                                                          Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                                                                                                                          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                                                                                          This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                                                                                          Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                                                                                          cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                                                                                          horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                          vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                          title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                          • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                          • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                          • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                          legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                          • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                          • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                          • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                          figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                                                                                          filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                          display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                                                                                          Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                                                                                          method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                                                                                                                          Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                                                                                          • In-training validation scores
                                                                                                                                                                                                                                                                                                                                                          • Cached predictions.
                                                                                                                                                                                                                                                                                                                                                          • Shap values
                                                                                                                                                                                                                                                                                                                                                          • App instance
                                                                                                                                                                                                                                                                                                                                                          • Dashboard instance
                                                                                                                                                                                                                                                                                                                                                          • Calculated holdout data sets

                                                                                                                                                                                                                                                                                                                                                          method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                                                                                                                          Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                                                                                                                          method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                                                                                                                          ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                                                                                                                          By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                          Note

                                                                                                                                                                                                                                                                                                                                                          Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                                                                                                                          filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                                                                                                                          **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                                                                                                                          method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                                                                                                                          This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                                                                                                                          Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                                                                                                                          Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                                                                                                                          method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                          Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                                                                                                                                          method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                                                                                                                          Tip

                                                                                                                                                                                                                                                                                                                                                          Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                                                                                                                          Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                                                                                                                          rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                                                                                          threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                                                                                          • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                                                                                          • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                                                                                          • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                                                                                          For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                                                                                          Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                                                                                                                          method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                                                                                                                          The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                                                                                                                          ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                                                                                          method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                                                                                                                          The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                                                                                                                          ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                                                                                                                          y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                                                                                                                          method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                                                                                                                          In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                                                                                                                          Warning

                                                                                                                                                                                                                                                                                                                                                          Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                                                                                                                          Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                                                                                                                          method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                                                                                                                          Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                                                                                                                          Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                                                                                                                          Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                                                                                                                          method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                                                                                                                          Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                                                                                                                          Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                                                          reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                                                                                                                          method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                                                                          Transformers that are only applied on the training set are skipped. The rest should all implement a inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                                                                                                                          ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                          • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                          Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                          series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                          method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                          Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                          method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                          Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                                                                                                                                          method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                          Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                          method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                                                                                                                          This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                                                                                                                          Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                                                                                                                          stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                                                                                                                          archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                                                                                                                          method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                                                                                          method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                                                                                                                          Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                                                                                          method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                                                                                                                          New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                                                                                                                          Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                          Info

                                                                                                                                                                                                                                                                                                                                                          If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                                                                                                                                          ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                          • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                          metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                                                                                                                          sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                          Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                                                                                                                                          method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                                                                                                                          The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                                                                                                                          Tip

                                                                                                                                                                                                                                                                                                                                                          Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                                                                                                                          Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                                                                                                                          host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                                                                                                                          port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                                                                                                                          method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                                                                          Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                                                                                                                          ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                                                                          y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                          • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                                          • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                          • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                          • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                          • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                          • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                          verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                          Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                          series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                          method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                                                                                          Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                                                                                          method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                                                          Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                          Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/tree/", "title": "DecisionTree", "text": "

                                                                                                                                                                                                                                                                                                                                                          Tree accept sparse native multilabel native multioutput

                                                                                                                                                                                                                                                                                                                                                          A single decision tree classifier/regressor.

                                                                                                                                                                                                                                                                                                                                                          Corresponding estimators are:

                                                                                                                                                                                                                                                                                                                                                          • DecisionTreeClassifier for classification tasks.
                                                                                                                                                                                                                                                                                                                                                          • DecisionTreeRegressor for regression tasks.

                                                                                                                                                                                                                                                                                                                                                          Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                                                                          See Also

                                                                                                                                                                                                                                                                                                                                                          ExtraTree Extremely Randomized Tree.

                                                                                                                                                                                                                                                                                                                                                          ExtraTrees Extremely Randomized Trees.

                                                                                                                                                                                                                                                                                                                                                          RandomForest Random Forest.

                                                                                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/tree/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                          >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"Tree\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: Tree\nMetric: f1\n\n\nResults for DecisionTree:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9589\nTime elapsed: 0.032s\n-------------------------------------------------\nTime: 0.032s\n\n\nFinal results ==================== >>\nTotal time: 0.035s\n-------------------------------------\nDecisionTree --> f1: 0.9589\n
                                                                                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/tree/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                                                                                                                                                                                                                                                                                                                                                          ParameterscriterionCategoricalDistribution(choices=('gini', 'entropy'))splitterCategoricalDistribution(choices=('best', 'random'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                                                                                                                                                                                                                                                                                                                          ParameterscriterionCategoricalDistribution(choices=('squared_error', 'absolute_error', 'friedman_mse', 'poisson'))splitterCategoricalDistribution(choices=('best', 'random'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)

                                                                                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/tree/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/tree/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                          Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                                                                                                                          Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                                                                                                                          Tip

                                                                                                                                                                                                                                                                                                                                                          Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                                                                                                                          mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                                                                                                                          The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                                                                                          "}, {"location": "API/models/tree/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                          Attributesname: strName of the model.

                                                                                                                                                                                                                                                                                                                                                          Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                                                                                                                          This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                                                          This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                                                                                                                          This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                                                                                                                          • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                                                                                                                          • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                                                                                                                          • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                                                                                                                          • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                                                                                                                          • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                                                                                                                          • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                          • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                                                                                                                            For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                                                                                                                            This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                                                                                                                            The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                                                                                                                            All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                                                                                            • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                            • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                            • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                                                                                            • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                                                                                            • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                                                                                            • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                                                                                            • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                                                                                            • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                                                                                                                              The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/tree/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                                                                              The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                                                                                                                              bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                                                              method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                                                                                                                              Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                                                                                                                              Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                                                                                                                              reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                                                                                                                              method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                                                                                                                              Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                                                                                                                              method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                                                                                              This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                                                                                              Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                                                                                              cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                                                                                              horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                              vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                              title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                              • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                              • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                              • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                              legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                              • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                              • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                              • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                              figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                                                                                              filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                              display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                                                                                              Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                                                                                              method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                                                                                                                              Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                                                                                              • In-training validation scores
                                                                                                                                                                                                                                                                                                                                                              • Cached predictions.
                                                                                                                                                                                                                                                                                                                                                              • Shap values
                                                                                                                                                                                                                                                                                                                                                              • App instance
                                                                                                                                                                                                                                                                                                                                                              • Dashboard instance
                                                                                                                                                                                                                                                                                                                                                              • Calculated holdout data sets

                                                                                                                                                                                                                                                                                                                                                              method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                                                                                                                              Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                                                                                                                              method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                                                                                                                              ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                                                                                                                              By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                              Note

                                                                                                                                                                                                                                                                                                                                                              Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                                                                                                                              filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                                                                                                                              **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                                                                                                                              method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                                                                                                                              This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                                                                                                                              Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                                                                                                                              Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                                                                                                                              method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                              Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                                                                                                                                              method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                                                                                                                              Tip

                                                                                                                                                                                                                                                                                                                                                              Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                                                                                                                              Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                                                                                                                              rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                                                                                              threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                                                                                              • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                                                                                              • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                                                                                              • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                                                                                              For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                                                                                              Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                                                                                                                              method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                                                                                                                              The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                                                                                                                              ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                                                                                              method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                                                                                                                              The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                                                                                                                              ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                                                                                                                              y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                                                                                                                              method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                                                                                                                              In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                                                                                                                              Warning

                                                                                                                                                                                                                                                                                                                                                              Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                                                                                                                              Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                                                                                                                              method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                                                                                                                              Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                                                                                                                              Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                                                                                                                              Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                                                                                                                              method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                                                                                                                              Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                                                                                                                              Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                                                              reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                                                                                                                              method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                                                                              Transformers that are only applied on the training set are skipped. The rest should all implement a inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                                                                                                                              ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                              • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                              Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                              series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                              method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                              Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                              method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                              Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                                                                                                                                              method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                              Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                              method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                                                                                                                              This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                                                                                                                              Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                                                                                                                              stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                                                                                                                              archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                                                                                                                              method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                                                                                              method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                                                                                                                              Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                                                                                              method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                                                                                                                              New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                                                                                                                              Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                              Info

                                                                                                                                                                                                                                                                                                                                                              If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                                                                                                                                              ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                              • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                              metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                                                                                                                              sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                              Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                                                                                                                                              method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                                                                                                                              The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                                                                                                                              Tip

                                                                                                                                                                                                                                                                                                                                                              Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                                                                                                                              Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                                                                                                                              host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                                                                                                                              port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                                                                                                                              method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                                                                              Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                                                                                                                              ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                                                                              y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                              • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                                              • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                              • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                              • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                              • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                              • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                              verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                              Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                              series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                              method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                                                                                              Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                                                                                              method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                                                              Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                              Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/xgb/", "title": "XGBoost", "text": "

                                                                                                                                                                                                                                                                                                                                                              XGB needs scaling accept sparse allows validation supports acceleration

                                                                                                                                                                                                                                                                                                                                                              XGBoost is an optimized distributed gradient boosting model designed to be highly efficient, flexible and portable. XGBoost provides a parallel tree boosting that solve many data science problems in a fast and accurate way.

                                                                                                                                                                                                                                                                                                                                                              Corresponding estimators are:

                                                                                                                                                                                                                                                                                                                                                              • XGBClassifier for classification tasks.
                                                                                                                                                                                                                                                                                                                                                              • XGBRegressor for regression tasks.

                                                                                                                                                                                                                                                                                                                                                              Read more in XGBoost's documentation.

                                                                                                                                                                                                                                                                                                                                                              See Also

                                                                                                                                                                                                                                                                                                                                                              CatBoost Cat Boosting Machine.

                                                                                                                                                                                                                                                                                                                                                              GradientBoostingMachine Gradient Boosting Machine.

                                                                                                                                                                                                                                                                                                                                                              LightGBM Light Gradient Boosting Machine.

                                                                                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/xgb/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                              >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(models=\"XGB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= >>\nModels: XGB\nMetric: f1\n\n\nResults for XGBoost:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9583\nTime elapsed: 0.401s\n-------------------------------------------------\nTime: 0.401s\n\n\nFinal results ==================== >>\nTotal time: 0.404s\n-------------------------------------\nXGBoost --> f1: 0.9583\n
                                                                                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/xgb/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

                                                                                                                                                                                                                                                                                                                                                              Parametersn_estimatorsIntDistribution(high=500, log=False, low=20, step=10)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_depthIntDistribution(high=20, log=False, low=1, step=1)gammaFloatDistribution(high=1.0, log=False, low=0.0, step=None)min_child_weightIntDistribution(high=10, log=False, low=1, step=1)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)colsample_bytreeFloatDistribution(high=1.0, log=False, low=0.4, step=0.1)reg_alphaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)reg_lambdaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)

                                                                                                                                                                                                                                                                                                                                                              Parametersn_estimatorsIntDistribution(high=500, log=False, low=20, step=10)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_depthIntDistribution(high=20, log=False, low=1, step=1)gammaFloatDistribution(high=1.0, log=False, low=0.0, step=None)min_child_weightIntDistribution(high=10, log=False, low=1, step=1)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)colsample_bytreeFloatDistribution(high=1.0, log=False, low=0.4, step=0.1)reg_alphaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)reg_lambdaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)

                                                                                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/xgb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/xgb/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                              Attributespipeline: PipelinePipeline of transforms.

                                                                                                                                                                                                                                                                                                                                                              Models that used automated feature scaling have the scaler added.

                                                                                                                                                                                                                                                                                                                                                              Tip

                                                                                                                                                                                                                                                                                                                                                              Use the plot_pipeline method to visualize the pipeline.

                                                                                                                                                                                                                                                                                                                                                              mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values.

                                                                                                                                                                                                                                                                                                                                                              The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                                                                                              "}, {"location": "API/models/xgb/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                              Attributesname: strName of the model.

                                                                                                                                                                                                                                                                                                                                                              Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model.

                                                                                                                                                                                                                                                                                                                                                              This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                                                              This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results.

                                                                                                                                                                                                                                                                                                                                                              This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:

                                                                                                                                                                                                                                                                                                                                                              • [param_name]: Parameter value used in this trial.
                                                                                                                                                                                                                                                                                                                                                              • estimator: Estimator used in this trial.
                                                                                                                                                                                                                                                                                                                                                              • [metric_name]: Metric score of the trial.
                                                                                                                                                                                                                                                                                                                                                              • [best_metric_name]: Best score so far in this study.
                                                                                                                                                                                                                                                                                                                                                              • time_trial: Duration of the trial.
                                                                                                                                                                                                                                                                                                                                                              • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                              • state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score.

                                                                                                                                                                                                                                                                                                                                                                For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial.

                                                                                                                                                                                                                                                                                                                                                                This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training.

                                                                                                                                                                                                                                                                                                                                                                Only the scores of the main metric are tracked. Included keys are: train and test. This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores.

                                                                                                                                                                                                                                                                                                                                                                The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as [metric]_bootstrap. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results.

                                                                                                                                                                                                                                                                                                                                                                All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                                                                                                • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                                • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                                • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                                                                                                • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                                                                                                • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                                                                                                • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                                                                                                • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                                                                                                • time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores.

                                                                                                                                                                                                                                                                                                                                                                  The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. This property is only available for estimators with at least one of those attributes.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/models/xgb/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The plots can be called directly from the model. The remaining utility methods can be found hereunder.

                                                                                                                                                                                                                                                                                                                                                                  bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                                                                  method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.

                                                                                                                                                                                                                                                                                                                                                                  Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

                                                                                                                                                                                                                                                                                                                                                                  Parametersn_bootstrap: int umber of bootstrapped samples to fit on.

                                                                                                                                                                                                                                                                                                                                                                  reset: bool, default=False Whether to start a new run or continue the existing one.

                                                                                                                                                                                                                                                                                                                                                                  method calibrate(**kwargs)[source]Calibrate the model.

                                                                                                                                                                                                                                                                                                                                                                  Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. If there is an active mlflow experiment, a new run is started using the name [model_name]_calibrate. Since the estimator changed, the model is cleared. Only for classifiers.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing.

                                                                                                                                                                                                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from the model.

                                                                                                                                                                                                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                                                                                                                                                                                                  • Cached predictions.
                                                                                                                                                                                                                                                                                                                                                                  • Shap values
                                                                                                                                                                                                                                                                                                                                                                  • App instance
                                                                                                                                                                                                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                                                                                                                                                                                                  method create_app(**kwargs)[source]Create an interactive app to test model predictions.

                                                                                                                                                                                                                                                                                                                                                                  Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the app attribute.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method.

                                                                                                                                                                                                                                                                                                                                                                  method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.

                                                                                                                                                                                                                                                                                                                                                                  ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.

                                                                                                                                                                                                                                                                                                                                                                  By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the mode=\"inline\" parameter. The created ExplainerDashboard instance can be accessed through the dashboard attribute. This method is not available for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  Note

                                                                                                                                                                                                                                                                                                                                                                  Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.

                                                                                                                                                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything.

                                                                                                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for the ExplainerDashboard instance.

                                                                                                                                                                                                                                                                                                                                                                  method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.

                                                                                                                                                                                                                                                                                                                                                                  This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric.

                                                                                                                                                                                                                                                                                                                                                                  Returnspd.DataFrame Overview of the results.

                                                                                                                                                                                                                                                                                                                                                                  method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

                                                                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                                  Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks.

                                                                                                                                                                                                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.

                                                                                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                                                                                  Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the threshold parameter.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used.

                                                                                                                                                                                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.

                                                                                                                                                                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                                                                                                  Returnspd.Series Scores of the model.

                                                                                                                                                                                                                                                                                                                                                                  method export_pipeline()[source]Export the transformer pipeline with final estimator.

                                                                                                                                                                                                                                                                                                                                                                  The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                                                                                                  method fit(X=None, y=None)[source]Fit and validate the model.

                                                                                                                                                                                                                                                                                                                                                                  The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, self.X_train is used.

                                                                                                                                                                                                                                                                                                                                                                  y: series, dataframe or None Target column corresponding to `X`. If None, self.y_train is used.

                                                                                                                                                                                                                                                                                                                                                                  method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.

                                                                                                                                                                                                                                                                                                                                                                  In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the estimator attribute. If there is an active mlflow experiment, a new run is started with the name [model_name]_full_train. Since the estimator changed, the model is cleared.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: pipeline = atom.export_pipeline().fit(atom.X, atom.y).

                                                                                                                                                                                                                                                                                                                                                                  Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set.

                                                                                                                                                                                                                                                                                                                                                                  method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.

                                                                                                                                                                                                                                                                                                                                                                  Only available for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold.

                                                                                                                                                                                                                                                                                                                                                                  Returnsfloat or list Best threshold or list of thresholds for multilabel tasks.

                                                                                                                                                                                                                                                                                                                                                                  method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.

                                                                                                                                                                                                                                                                                                                                                                  Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.

                                                                                                                                                                                                                                                                                                                                                                  Parametersn_trials: int Number of trials for the hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                                                                  reset: bool, default=False Whether to start a new study or continue the existing one.

                                                                                                                                                                                                                                                                                                                                                                  method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. The rest should all implement a inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Original feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                                  series or dataframe Original target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                                  method predict(X, verbose=None)[source]Get predictions on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

                                                                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                                  Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

                                                                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes).

                                                                                                                                                                                                                                                                                                                                                                  method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.

                                                                                                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

                                                                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.

                                                                                                                                                                                                                                                                                                                                                                  This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

                                                                                                                                                                                                                                                                                                                                                                  Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created.

                                                                                                                                                                                                                                                                                                                                                                  stage: str, default=\"None\" New desired stage for the model.

                                                                                                                                                                                                                                                                                                                                                                  archive_existing_versions: bool, default=False Whether all existing model versions in the stage will be moved to the \"Archived\" stage. Only valid when stage is \"Staging\" or \"Production\", otherwise an error will be raised.

                                                                                                                                                                                                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                                                                                                  method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.

                                                                                                                                                                                                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                                                                                                  method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.

                                                                                                                                                                                                                                                                                                                                                                  New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

                                                                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Info

                                                                                                                                                                                                                                                                                                                                                                  If the metric parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on.

                                                                                                                                                                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                                  • If None: X must be a selection of rows in the dataset.
                                                                                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred) -> score or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs).

                                                                                                                                                                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                                  Returnsfloat Metric score of X with respect to y.

                                                                                                                                                                                                                                                                                                                                                                  method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.

                                                                                                                                                                                                                                                                                                                                                                  The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get(\"http://127.0.0.1:8000/\", json=X.to_json()). The deployment is done on a ray cluster. The default host and port parameters deploy to localhost.

                                                                                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                                                                                  Use import ray; ray.serve.shutdown() to close the endpoint after finishing.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmethod: str, default=\"predict\" Estimator's method to do inference on.

                                                                                                                                                                                                                                                                                                                                                                  host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\".

                                                                                                                                                                                                                                                                                                                                                                  port: int, default=8000 Port for HTTP server.

                                                                                                                                                                                                                                                                                                                                                                  method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers.

                                                                                                                                                                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/nlp/textcleaner/", "title": "TextCleaner", "text": "

                                                                                                                                                                                                                                                                                                                                                                  class atom.nlp.TextCleaner(decode=True, lower_case=True, drop_email=True, regex_email=None, drop_url=True, regex_url=None, drop_html=True, regex_html=None, drop_emoji=True, regex_emoji=None, drop_number=True, regex_number=None, drop_punctuation=True, verbose=0, logger=None)[source]Applies standard text cleaning to the corpus.

                                                                                                                                                                                                                                                                                                                                                                  Transformations include normalizing characters and dropping noise from the text (emails, HTML tags, URLs, etc...). The transformations are applied on the column named corpus, in the same order the parameters are presented. If there is no column with that name, an exception is raised.

                                                                                                                                                                                                                                                                                                                                                                  This class can be accessed from atom through the textclean method. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Parametersdecode: bool, default=True Whether to decode unicode characters to their ascii representations.

                                                                                                                                                                                                                                                                                                                                                                  lower_case: bool, default=True Whether to convert all characters to lower case.

                                                                                                                                                                                                                                                                                                                                                                  drop_email: bool, default=True Whether to drop email addresses from the text.

                                                                                                                                                                                                                                                                                                                                                                  regex_email: str, default=None Regex used to search for email addresses. If None, it uses r\"[\\w.-]+@[\\w-]+\\.[\\w.-]+\".

                                                                                                                                                                                                                                                                                                                                                                  drop_url: bool, default=True Whether to drop URL links from the text.

                                                                                                                                                                                                                                                                                                                                                                  regex_url: str, default=None Regex used to search for URLs. If None, it uses r\"https?://\\S+|www\\.\\S+\".

                                                                                                                                                                                                                                                                                                                                                                  drop_html: bool, default=True Whether to drop HTML tags from the text. This option is particularly useful if the data was scraped from a website.

                                                                                                                                                                                                                                                                                                                                                                  regex_html: str, default=None Regex used to search for html tags. If None, it uses r\"<.*?>\".

                                                                                                                                                                                                                                                                                                                                                                  drop_emoji: bool, default=True Whether to drop emojis from the text.

                                                                                                                                                                                                                                                                                                                                                                  regex_emoji: str, default=None Regex used to search for emojis. If None, it uses r\":[a-z_]+:\".

                                                                                                                                                                                                                                                                                                                                                                  drop_number: bool, default=True Whether to drop numbers from the text.

                                                                                                                                                                                                                                                                                                                                                                  regex_number: str, default=None Regex used to search for numbers. If None, it uses r\"\\b\\d+\\b\". Note that numbers adjacent to letters are not removed.

                                                                                                                                                                                                                                                                                                                                                                  drop_punctuation: bool, default=True Whether to drop punctuations from the text. Characters considered punctuation are !\"#$%&'()*+,-./:;<=>?@[\\]^_~`.

                                                                                                                                                                                                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic naming.
                                                                                                                                                                                                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  TextNormalizer Normalize the corpus.

                                                                                                                                                                                                                                                                                                                                                                  Tokenizer Tokenize the corpus.

                                                                                                                                                                                                                                                                                                                                                                  Vectorizer Vectorize text data.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/nlp/textcleaner/#example", "title": "Example", "text": "atomstand-alone
                                                                                                                                                                                                                                                                                                                                                                  >>> import numpy as np\n>>> from atom import ATOMClassifier\n>>> from sklearn.datasets import fetch_20newsgroups\n\n>>> X, y = fetch_20newsgroups(\n...     return_X_y=True,\n...     categories=[\"alt.atheism\", \"sci.med\", \"comp.windows.x\"],\n...     shuffle=True,\n...     random_state=1,\n... )\n>>> X = np.array(X).reshape(-1, 1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> print(atom.dataset)\n\n                                                 corpus  target\n0     From: fabian@vivian.w.open.de (Fabian Hoppe)\\n...       1\n1     From: nyeda@cnsvax.uwec.edu (David Nye)\\nSubje...       0\n2     From: urathi@net4.ICS.UCI.EDU (Unmesh Rathi)\\n...       1\n3     From: inoue@crd.yokogawa.co.jp (Inoue Takeshi)...       1\n4     From: sandvik@newton.apple.com (Kent Sandvik)\\...       0\n...                                                 ...     ...\n1662  From: kutluk@ccl.umist.ac.uk (Kutluk Ozguven)\\...       0\n1663  From: dmp1@ukc.ac.uk (D.M.Procida)\\nSubject: R...       2\n1664  From: tdunbar@vtaix.cc.vt.edu (Thomas Dunbar)\\...       1\n1665  From: dmp@fig.citib.com (Donna M. Paino)\\nSubj...       2\n1666  From: cdm@pmafire.inel.gov (Dale Cook)\\nSubjec...       
2\n\n[1667 rows x 2 columns]\n\n\n>>> atom.textclean(verbose=2)\n\nFitting TextCleaner...\nCleaning the corpus...\n --> Decoding unicode characters to ascii.\n --> Converting text to lower case.\n --> Dropping emails from documents.\n --> Dropping URL links from documents.\n --> Dropping HTML tags from documents.\n --> Dropping emojis from documents.\n --> Dropping numbers from documents.\n --> Dropping punctuation from the text.\n\n\n>>> print(atom.dataset)\n\n                                                 corpus  target\n0     from  fabian hoppe\\nsubject searching cadsoftw...       1\n1     from  david nye\\nsubject re after  years can w...       0\n2     from  unmesh rathi\\nsubject motif and intervie...       1\n3     from  inoue takeshi\\nsubject how to see charac...       1\n4     from  kent sandvik\\nsubject re slavery was re ...       0\n...                                                 ...     ...\n1662  from  kutluk ozguven\\nsubject re jewish settle...       0\n1663  from  dmprocida\\nsubject re homeopathy a respe...       2\n1664  from  thomas dunbar\\nsubject re x toolkits\\nsu...       1\n1665  from  donna m paino\\nsubject psoriatic arthrit...       2\n1666  from  dale cook\\nsubject re morbus meniere  is...       2\n\n[1667 rows x 2 columns]\n
                                                                                                                                                                                                                                                                                                                                                                  >>> import numpy as np\n>>> from atom.nlp import TextCleaner\n>>> from sklearn.datasets import fetch_20newsgroups\n\n>>> X, y = fetch_20newsgroups(\n...     return_X_y=True,\n...     categories=[\"alt.atheism\", \"sci.med\", \"comp.windows.x\"],\n...     shuffle=True,\n...     random_state=1,\n... )\n>>> X = np.array(X).reshape(-1, 1)\n\n>>> textcleaner = TextCleaner(verbose=2)\n>>> X = textcleaner.transform(X)\n\nCleaning the corpus...\n --> Decoding unicode characters to ascii.\n --> Converting text to lower case.\n --> Dropping emails from documents.\n --> Dropping URL links from documents.\n --> Dropping HTML tags from documents.\n --> Dropping emojis from documents.\n --> Dropping numbers from documents.\n --> Dropping punctuation from the text.\n\n\n>>> print(X)\n\n                                                 corpus\n0     from  mark a deloura\\nsubject looking for x wi...\n1     from  der mouse\\nsubject re creating  bit wind...\n2     from  keith m ryan\\nsubject re where are they ...\n3     from  steven grimm\\nsubject re opinions on all...\n4     from  peter kaminski\\nsubject re krillean phot...\n...                                                 ...\n1662  from donald mackie \\nsubject re seeking advice...\n1663  from  gordon banks\\nsubject re update help was...\n1664  from  keith m ryan\\nsubject re political athei...\n1665  from  benedikt rosenau\\nsubject re biblical ra...\n1666  from derrick j brashear \\nsubject mouseless op...\n\n[1667 rows x 1 columns]\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/nlp/textcleaner/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                                                                                  fitDo nothing.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformApply the transformations to the data.

                                                                                                                                                                                                                                                                                                                                                                  method fit(X=None, y=None, **fit_params)[source]Do nothing.

                                                                                                                                                                                                                                                                                                                                                                  Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                                                                                                                                                                                                  Returnsself Estimator instance.

                                                                                                                                                                                                                                                                                                                                                                  method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                                                                                                                                                                                                  method inverse_transform(X=None, y=None)[source]Do nothing.

                                                                                                                                                                                                                                                                                                                                                                  Returns the input unchanged. Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                                  series or dataframe Target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                                                                                                                                                                                                  method transform(X, y=None)[source]Apply the transformations to the data.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). If X is not a dataframe, it should be composed of a single feature containing the text documents.

                                                                                                                                                                                                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Transformed corpus.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/nlp/textnormalizer/", "title": "TextNormalizer", "text": "

                                                                                                                                                                                                                                                                                                                                                                  class atom.nlp.TextNormalizer(stopwords=True, custom_stopwords=None, stem=False, lemmatize=True, verbose=0, logger=None)[source]Normalize the corpus.

                                                                                                                                                                                                                                                                                                                                                                  Convert words to a more uniform standard. The transformations are applied on the column named corpus, in the same order the parameters are presented. If there is no column with that name, an exception is raised. If the provided documents are strings, words are separated by spaces.

                                                                                                                                                                                                                                                                                                                                                                  This class can be accessed from atom through the textnormalize method. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Parametersstopwords: bool or str, default=True Whether to remove a predefined dictionary of stopwords.

                                                                                                                                                                                                                                                                                                                                                                  • If False: Don't remove any predefined stopwords.
                                                                                                                                                                                                                                                                                                                                                                  • If True: Drop predefined english stopwords from the text.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Language from nltk.corpus.stopwords.words.

                                                                                                                                                                                                                                                                                                                                                                  custom_stopwords: sequence or None, default=None Custom stopwords to remove from the text.

                                                                                                                                                                                                                                                                                                                                                                  stem: bool or str, default=False Whether to apply stemming using SnowballStemmer.

                                                                                                                                                                                                                                                                                                                                                                  • If False: Don't apply stemming.
                                                                                                                                                                                                                                                                                                                                                                  • If True: Apply stemmer based on the english language.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Language from SnowballStemmer.languages.

                                                                                                                                                                                                                                                                                                                                                                  lemmatize: bool, default=True Whether to apply lemmatization using WordNetLemmatizer.

                                                                                                                                                                                                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic naming.
                                                                                                                                                                                                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                                                                                                                                                                                                  Attributesfeature_names_in_: np.ndarray Names of features seen during fit.

                                                                                                                                                                                                                                                                                                                                                                  n_features_in_: int Number of features seen during fit.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  TextCleaner Applies standard text cleaning to the corpus.

                                                                                                                                                                                                                                                                                                                                                                  Tokenizer Tokenize the corpus.

                                                                                                                                                                                                                                                                                                                                                                  Vectorizer Vectorize text data.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/nlp/textnormalizer/#example", "title": "Example", "text": "atomstand-alone
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n\n>>> X = [\n...    [\"I \u00e0m in ne'w york\"],\n...    [\"New york is nice\"],\n...    [\"new york\"],\n...    [\"hi there this is a test!\"],\n...    [\"another line...\"],\n...    [\"new york is larger than washington\"],\n...    [\"running the test\"],\n...    [\"this is a test\"],\n... ]\n>>> y = [1, 0, 0, 1, 1, 1, 0, 0]\n\n>>> atom = ATOMClassifier(X, y, test_size=2, random_state=1)\n>>> print(atom.dataset)\n\n                               corpus  target\n0                            new york       0\n1                     another line...       1\n2                    New york is nice       0\n3  new york is larger than washington       1\n4                    running the test       0\n5                   I \u00e0m in ne'w york       1\n6                      this is a test       0\n7            hi there this is a test!       1\n\n\n>>> atom.textnormalize(stopwords=\"english\", lemmatize=True, verbose=2)\n\nFitting TextNormalizer...\nNormalizing the corpus...\n --> Dropping stopwords.\n --> Applying lemmatization.\n\n\n>>> print(atom.dataset)\n\n                           corpus  target\n0                     [new, york]       0\n1              [another, line...]       1\n2               [New, york, nice]       0\n3  [new, york, large, washington]       1\n4                     [run, test]       0\n5             [I, \u00e0m, ne'w, york]       1\n6                          [test]       0\n7                     [hi, test!]       1\n
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom.nlp import TextNormalizer\n\n>>> X = [\n...    [\"I \u00e0m in ne'w york\"],\n...    [\"New york is nice\"],\n...    [\"new york\"],\n...    [\"hi there this is a test!\"],\n...    [\"another line...\"],\n...    [\"new york is larger than washington\"],\n...    [\"running the test\"],\n...    [\"this is a test\"],\n... ]\n\n>>> textnormalizer = TextNormalizer(\n...     stopwords=\"english\",\n...     lemmatize=True,\n...     verbose=2,\n... )\n>>> X = textnormalizer.transform(X)\n\nNormalizing the corpus...\n --> Dropping stopwords.\n --> Applying lemmatization.\n\n\n>>> print(X)\n\n                           corpus\n0             [I, \u00e0m, ne'w, york]\n1               [New, york, nice]\n2                     [new, york]\n3                     [hi, test!]\n4              [another, line...]\n5  [new, york, large, washington]\n6                     [run, test]\n7                          [test]\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/nlp/textnormalizer/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                                                                                  fitDo nothing.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformNormalize the text.

                                                                                                                                                                                                                                                                                                                                                                  method fit(X=None, y=None, **fit_params)[source]Do nothing.

                                                                                                                                                                                                                                                                                                                                                                  Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                                                                                                                                                                                                  Returnsself Estimator instance.

                                                                                                                                                                                                                                                                                                                                                                  method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                                                                                                                                                                                                  method inverse_transform(X=None, y=None)[source]Do nothing.

                                                                                                                                                                                                                                                                                                                                                                  Returns the input unchanged. Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                                  series or dataframe Target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                                                                                                                                                                                                  method transform(X, y=None)[source]Normalize the text.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). If X is not a dataframe, it should be composed of a single feature containing the text documents.

                                                                                                                                                                                                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Transformed corpus.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/nlp/tokenizer/", "title": "Tokenizer", "text": "

                                                                                                                                                                                                                                                                                                                                                                  class atom.nlp.Tokenizer(bigram_freq=None, trigram_freq=None, quadgram_freq=None, verbose=0, logger=None)[source]Tokenize the corpus.

                                                                                                                                                                                                                                                                                                                                                                  Convert documents into sequences of words. Additionally, create n-grams (represented by words united with underscores, e.g., \"New_York\") based on their frequency in the corpus. The transformations are applied on the column named corpus. If there is no column with that name, an exception is raised.

                                                                                                                                                                                                                                                                                                                                                                  This class can be accessed from atom through the tokenize method. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Parametersbigram_freq: int, float or None, default=None Frequency threshold for bigram creation.

                                                                                                                                                                                                                                                                                                                                                                  • If None: Don't create any bigrams.
                                                                                                                                                                                                                                                                                                                                                                  • If int: Minimum number of occurrences to make a bigram.
                                                                                                                                                                                                                                                                                                                                                                  • If float: Minimum frequency fraction to make a bigram.

                                                                                                                                                                                                                                                                                                                                                                  trigram_freq: int, float or None, default=None Frequency threshold for trigram creation.

                                                                                                                                                                                                                                                                                                                                                                  • If None: Don't create any trigrams.
                                                                                                                                                                                                                                                                                                                                                                  • If int: Minimum number of occurrences to make a trigram.
                                                                                                                                                                                                                                                                                                                                                                  • If float: Minimum frequency fraction to make a trigram.

                                                                                                                                                                                                                                                                                                                                                                  quadgram_freq: int, float or None, default=None Frequency threshold for quadgram creation.

                                                                                                                                                                                                                                                                                                                                                                  • If None: Don't create any quadgrams.
                                                                                                                                                                                                                                                                                                                                                                  • If int: Minimum number of occurrences to make a quadgram.
                                                                                                                                                                                                                                                                                                                                                                  • If float: Minimum frequency fraction to make a quadgram.

                                                                                                                                                                                                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic naming.
                                                                                                                                                                                                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                                                                                                                                                                                                  Attributesbigrams_: pd.DataFrame Created bigrams and their frequencies.

                                                                                                                                                                                                                                                                                                                                                                  trigrams_: pd.DataFrame Created trigrams and their frequencies.

                                                                                                                                                                                                                                                                                                                                                                  quadgrams_: pd.DataFrame Created quadgrams and their frequencies.

                                                                                                                                                                                                                                                                                                                                                                  feature_names_in_: np.ndarray Names of features seen during fit.

                                                                                                                                                                                                                                                                                                                                                                  n_features_in_: int Number of features seen during fit.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  TextCleaner Applies standard text cleaning to the corpus.

                                                                                                                                                                                                                                                                                                                                                                  TextNormalizer Normalize the corpus.

                                                                                                                                                                                                                                                                                                                                                                  Vectorizer Vectorize text data.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/nlp/tokenizer/#example", "title": "Example", "text": "atomstand-alone
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n\n>>> X = [\n...    [\"I \u00e0m in ne'w york\"],\n...    [\"New york is nice\"],\n...    [\"new york\"],\n...    [\"hi there this is a test!\"],\n...    [\"another line...\"],\n...    [\"new york is larger than washington\"],\n...    [\"running the test\"],\n...    [\"this is a test\"],\n... ]\n>>> y = [1, 0, 0, 1, 1, 1, 0, 0]\n\n>>> atom = ATOMClassifier(X, y, test_size=2, random_state=1)\n>>> print(atom.dataset)\n\n                               corpus  target\n0                            new york       0\n1                     another line...       1\n2                    New york is nice       0\n3  new york is larger than washington       1\n4                    running the test       0\n5                   I \u00e0m in ne'w york       1\n6                      this is a test       0\n7            hi there this is a test!       1\n\n\n>>> atom.tokenize(verbose=2)\n\nFitting Tokenizer...\nTokenizing the corpus...\n\n\n>>> print(atom.dataset)\n\n                                      corpus  target\n0                                [new, york]       0\n1                       [another, line, ...]       1\n2                      [New, york, is, nice]       0\n3  [new, york, is, larger, than, washington]       1\n4                       [running, the, test]       0\n5                [I, \u00e0m, in, ne, ', w, york]       1\n6                        [this, is, a, test]       0\n7          [hi, there, this, is, a, test, !]       1\n
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom.nlp import Tokenizer\n\n>>> X = [\n...    [\"I \u00e0m in ne'w york\"],\n...    [\"New york is nice\"],\n...    [\"new york\"],\n...    [\"hi there this is a test!\"],\n...    [\"another line...\"],\n...    [\"new york is larger than washington\"],\n...    [\"running the test\"],\n...    [\"this is a test\"],\n... ]\n\n>>> tokenizer = Tokenizer(bigram_freq=2, verbose=2)\n>>> X = tokenizer.transform(X)\n\nTokenizing the corpus...\n --> Creating 5 bigrams on 10 locations.\n\n\n>>> print(X)\n\n                                     corpus\n0               [I, \u00e0m, in, ne, ', w, york]\n1                      [New, york_is, nice]\n2                                [new_york]\n3           [hi, there, this_is, a_test, !]\n4                      [another, line, ...]\n5  [new, york_is, larger, than, washington]\n6                      [running, the, test]\n7                         [this_is, a_test]\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/nlp/tokenizer/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                                                                                  fitDo nothing.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformTokenize the text.

                                                                                                                                                                                                                                                                                                                                                                  method fit(X=None, y=None, **fit_params)[source]Do nothing.

                                                                                                                                                                                                                                                                                                                                                                  Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                                                                                                                                                                                                  Returnsself Estimator instance.

                                                                                                                                                                                                                                                                                                                                                                  method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                                                                                                                                                                                                  method inverse_transform(X=None, y=None)[source]Do nothing.

                                                                                                                                                                                                                                                                                                                                                                  Returns the input unchanged. Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                                  series or dataframe Target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                                                                                                                                                                                                  method transform(X, y=None)[source]Tokenize the text.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). If X is not a dataframe, it should be composed of a single feature containing the text documents.

                                                                                                                                                                                                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Transformed corpus.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/nlp/vectorizer/", "title": "Vectorizer", "text": "

                                                                                                                                                                                                                                                                                                                                                                  class atom.nlp.Vectorizer(strategy=\"bow\", return_sparse=True, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, **kwargs)[source]Vectorize text data.

                                                                                                                                                                                                                                                                                                                                                                  Transform the corpus into meaningful vectors of numbers. The transformation is applied on the column named corpus. If there is no column with that name, an exception is raised.

                                                                                                                                                                                                                                                                                                                                                                  If strategy=\"bow\" or \"tfidf\", the transformed columns are named after the word they are embedding with the prefix corpus_. If strategy=\"hashing\", the columns are named hash[N], where N stands for the n-th hashed column.

                                                                                                                                                                                                                                                                                                                                                                  This class can be accessed from atom through the vectorize method. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Parametersstrategy: str, default=\"bow\" Strategy with which to vectorize the text. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"bow\": Bag of Words.
                                                                                                                                                                                                                                                                                                                                                                  • \"tfidf\": Term Frequency - Inverse Document Frequency.
                                                                                                                                                                                                                                                                                                                                                                  • \"hashing\": Vectorize to a matrix of token occurrences.

                                                                                                                                                                                                                                                                                                                                                                  return_sparse: bool, default=True Whether to return the transformation output as a dataframe of sparse arrays. Must be False when there are other columns in X (besides corpus) that are non-sparse.

                                                                                                                                                                                                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic naming.
                                                                                                                                                                                                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for the strategy estimator.

                                                                                                                                                                                                                                                                                                                                                                  Attributes[strategy]_: sklearn transformer Estimator instance (lowercase strategy) used to vectorize the corpus, e.g., vectorizer.tfidf for the tfidf strategy.

                                                                                                                                                                                                                                                                                                                                                                  feature_names_in_: np.ndarray Names of features seen during fit.

                                                                                                                                                                                                                                                                                                                                                                  n_features_in_: int Number of features seen during fit.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  TextCleaner Applies standard text cleaning to the corpus.

                                                                                                                                                                                                                                                                                                                                                                  TextNormalizer Normalize the corpus.

                                                                                                                                                                                                                                                                                                                                                                  Tokenizer Tokenize the corpus.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/nlp/vectorizer/#example", "title": "Example", "text": "atomstand-alone
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n\n>>> X = [\n...    [\"I \u00e0m in ne'w york\"],\n...    [\"New york is nice\"],\n...    [\"new york\"],\n...    [\"hi there this is a test!\"],\n...    [\"another line...\"],\n...    [\"new york is larger than washington\"],\n...    [\"running the test\"],\n...    [\"this is a test\"],\n... ]\n>>> y = [1, 0, 0, 1, 1, 1, 0, 0]\n\n>>> atom = ATOMClassifier(X, y, test_size=2, random_state=1)\n>>> print(atom.dataset)\n\n                               corpus  target\n0                            new york       0\n1                     another line...       1\n2                    New york is nice       0\n3  new york is larger than washington       1\n4                    running the test       0\n5                   I \u00e0m in ne'w york       1\n6                      this is a test       0\n7            hi there this is a test!       
1\n\n\n>>> atom.vectorize(strategy=\"tfidf\", verbose=2)\n\nFitting Vectorizer...\nVectorizing the corpus...\n\n\n>>> print(atom.dataset)\n\n   corpus_another  corpus_in  corpus_is  corpus_larger  corpus_line  corpus_ne  corpus_new  corpus_nice  corpus_running  corpus_test  corpus_than  corpus_the  corpus_washington  corpus_york  corpus_\u00e0m  target\n0        0.000000   0.000000   0.000000       0.000000     0.000000   0.000000    0.759339     0.000000         0.00000     0.000000     0.000000     0.00000           0.000000     0.650696   0.000000       0\n1        0.707107   0.000000   0.000000       0.000000     0.707107   0.000000    0.000000     0.000000         0.00000     0.000000     0.000000     0.00000           0.000000     0.000000   0.000000       1\n2        0.000000   0.000000   0.518242       0.000000     0.000000   0.000000    0.437535     0.631991         0.00000     0.000000     0.000000     0.00000           0.000000     0.374934   0.000000       0\n3        0.000000   0.000000   0.386401       0.471212     0.000000   0.000000    0.326226     0.000000         0.00000     0.000000     0.471212     0.00000           0.471212     0.279551   0.000000       1\n4        0.000000   0.000000   0.000000       0.000000     0.000000   0.000000    0.000000     0.000000         0.57735     0.577350     0.000000     0.57735           0.000000     0.000000   0.000000       0\n5        0.000000   0.546199   0.000000       0.000000     0.000000   0.546199    0.000000     0.000000         0.00000     0.000000     0.000000     0.00000           0.000000     0.324037   0.546199       1\n6        0.000000   0.000000   0.634086       0.000000     0.000000   0.000000    0.000000     0.000000         0.00000     0.773262     0.000000     0.00000           0.000000     0.000000   0.000000       0\n7        0.000000   0.000000   0.634086       0.000000     0.000000   0.000000    0.000000     0.000000         0.00000     0.773262     0.000000     0.00000           
0.000000     0.000000   0.000000       1\n
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom.nlp import Vectorizer\n\n>>> X = [\n...    [\"I \u00e0m in ne'w york\"],\n...    [\"New york is nice\"],\n...    [\"new york\"],\n...    [\"hi there this is a test!\"],\n...    [\"another line...\"],\n...    [\"new york is larger than washington\"],\n...    [\"running the test\"],\n...    [\"this is a test\"],\n... ]\n\n>>> vectorizer = Vectorizer(strategy=\"tfidf\", verbose=2)\n>>> X = vectorizer.fit_transform(X)\n\nFitting Vectorizer...\nVectorizing the corpus...\n\n\n>>> print(X)\n\n   corpus_another  corpus_hi  corpus_in  corpus_is  corpus_larger  corpus_line  corpus_ne  corpus_new  corpus_nice  corpus_running  corpus_test  corpus_than  corpus_the  corpus_there  corpus_this  corpus_washington  corpus_york  corpus_\u00e0m\n0        0.000000   0.000000   0.542162   0.000000       0.000000     0.000000   0.542162    0.000000     0.000000        0.000000     0.000000     0.000000    0.000000      0.000000     0.000000           0.000000     0.343774   0.542162\n1        0.000000   0.000000   0.000000   0.415657       0.000000     0.000000   0.000000    0.474072     0.655527        0.000000     0.000000     0.000000    0.000000      0.000000     0.000000           0.000000     0.415657   0.000000\n2        0.000000   0.000000   0.000000   0.000000       0.000000     0.000000   0.000000    0.751913     0.000000        0.000000     0.000000     0.000000    0.000000      0.000000     0.000000           0.000000     0.659262   0.000000\n3        0.000000   0.525049   0.000000   0.332923       0.000000     0.000000   0.000000    0.000000     0.000000        0.000000     0.379712     0.000000    0.000000      
0.525049     0.440032           0.000000     0.000000   0.000000\n4        0.707107   0.000000   0.000000   0.000000       0.000000     0.707107   0.000000    0.000000     0.000000        0.000000     0.000000     0.000000    0.000000      0.000000     0.000000           0.000000     0.000000   0.000000\n5        0.000000   0.000000   0.000000   0.304821       0.480729     0.000000   0.000000    0.347660     0.000000        0.000000     0.000000     0.480729    0.000000      0.000000     0.000000           0.480729     0.304821   0.000000\n6        0.000000   0.000000   0.000000   0.000000       0.000000     0.000000   0.000000    0.000000     0.000000        0.629565     0.455297     0.000000    0.629565      0.000000     0.000000           0.000000     0.000000   0.000000\n7        0.000000   0.000000   0.000000   0.497041       0.000000     0.000000   0.000000    0.000000     0.000000        0.000000     0.566893     0.000000    0.000000      0.000000     0.656949           0.000000     0.000000   0.000000\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/nlp/vectorizer/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                                                                                  fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformVectorize the text.

                                                                                                                                                                                                                                                                                                                                                                  method fit(X, y=None)[source]Fit to data.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). If X is not a dataframe, it should be composed of a single feature containing the text documents.

                                                                                                                                                                                                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  ReturnsSelf Estimator instance.

                                                                                                                                                                                                                                                                                                                                                                  method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                                                                                                                                                                                                  method inverse_transform(X=None, y=None)[source]Do nothing.

                                                                                                                                                                                                                                                                                                                                                                  Returns the input unchanged. Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                                  • If dataframe-like: Target columns with shape=(n_samples, n_targets) for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                                  series or dataframe Target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                                                                                                                                                                                                  method transform(X, y=None)[source]Vectorize the text.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). If X is not a dataframe, it should be composed of a single feature containing the text documents.

                                                                                                                                                                                                                                                                                                                                                                  y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Transformed corpus.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/pipeline/pipeline/", "title": "Pipeline", "text": "

                                                                                                                                                                                                                                                                                                                                                                  class atom.pipeline.Pipeline(steps, memory=None, verbose=0)[source]Pipeline of transforms with a final estimator.

                                                                                                                                                                                                                                                                                                                                                                  Sequentially apply a list of transforms and a final estimator. Intermediate steps of the pipeline must be transformsers, that is, they must implement fit and transform methods. The final estimator only needs to implement fit. The transformers in the pipeline can be cached using the memory parameter.

                                                                                                                                                                                                                                                                                                                                                                  The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters. For this, it enables setting parameters of the various steps using their names and the parameter name separated by __, as in the example below. A step's estimator may be replaced entirely by setting the parameter with its name to another estimator, or a transformer removed by setting it to passthrough or None.

                                                                                                                                                                                                                                                                                                                                                                  Read more in sklearn's the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Info

                                                                                                                                                                                                                                                                                                                                                                  This class behaves similarly to sklearn's pipeline, and additionally:

                                                                                                                                                                                                                                                                                                                                                                  • Works with an empty pipeline.
                                                                                                                                                                                                                                                                                                                                                                  • Accepts transformers that drop rows.
                                                                                                                                                                                                                                                                                                                                                                  • Accepts transformers that only are fitted on a subset of the provided dataset.
                                                                                                                                                                                                                                                                                                                                                                  • Accepts transformers that apply only on the target column.
                                                                                                                                                                                                                                                                                                                                                                  • Uses transformers that are only applied on the training set to fit the pipeline, not to make predictions on new data.
                                                                                                                                                                                                                                                                                                                                                                  • The instance is considered fitted at initialization if all the underlying transformers/estimator in the pipeline are.
                                                                                                                                                                                                                                                                                                                                                                  • It returns attributes from the final estimator if they are not of the Pipeline.
                                                                                                                                                                                                                                                                                                                                                                  • The last transformer is also cached.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  This Pipeline only works with estimators whose parameters for fit, transform, predict, etc... are named X and/or y.

                                                                                                                                                                                                                                                                                                                                                                  Parameterssteps: list of tuple List of (name, transform) tuples (implementing fit/transform) that are chained in sequential order.

                                                                                                                                                                                                                                                                                                                                                                  memory: str, Memory or None, default=None Used to cache the fitted transformers of the pipeline. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute named_steps or steps to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time-consuming.

                                                                                                                                                                                                                                                                                                                                                                  verbose: int or None, default=0 Verbosity level of the transformers in the pipeline. If None, it leaves them to their original verbosity. If >0, the time elapsed while fitting each step is printed.

                                                                                                                                                                                                                                                                                                                                                                  Attributesnamed_steps: Bunch Dictionary-like object, with the following attributes. Read-only attribute to access any step parameter by user given name. Keys are step names and values are steps parameters.

                                                                                                                                                                                                                                                                                                                                                                  classes_: np.ndarray of shape (n_classes,) The class' labels. Only exist if the last step of the pipeline is a classifier.

                                                                                                                                                                                                                                                                                                                                                                  feature_names_in_: np.ndarray Names of features seen during first step fit method.

                                                                                                                                                                                                                                                                                                                                                                  n_features_in_: int Number of features seen during first step fit method.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/pipeline/pipeline/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> # Initialize atom\n>>> atom = ATOMClassifier(X, y, verbose=2)\n\n<< ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 138.97 kB\nScaled: False\nOutlier values: 165 (1.2%)\n\n\n\n>>> # Apply data cleaning and feature engineering methods\n>>> atom.scale()\n\nFitting Scaler...\nScaling features...\n\n>>> atom.balance(strategy=\"smote\")\n\nOversampling with SMOTE...\n --> Adding 116 samples to class 0.\n\n>>> atom.feature_selection(strategy=\"rfe\", solver=\"lr\", n_features=22)\n\nFitting FeatureSelector...\nPerforming feature selection ...\n --> rfe selected 22 features from the dataset.\n   --> Dropping feature mean texture (rank 2).\n   --> Dropping feature mean smoothness (rank 3).\n   --> Dropping feature mean symmetry (rank 9).\n   --> Dropping feature texture error (rank 7).\n   --> Dropping feature smoothness error (rank 4).\n   --> Dropping feature concavity error (rank 5).\n   --> Dropping feature worst compactness (rank 8).\n   --> Dropping feature worst fractal dimension (rank 6).\n\n\n>>> # Train models\n>>> atom.run(models=\"LR\")\n\n\nTraining ========================= >>\nModels: LR\nMetric: f1\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9913\nTest 
evaluation --> f1: 0.9787\nTime elapsed: 0.030s\n-------------------------------------------------\nTime: 0.030s\n\n\nFinal results ==================== >>\nTotal time: 0.033s\n-------------------------------------\nLogisticRegression --> f1: 0.9787\n\n\n>>> # Get the pipeline and make predictions\n>>> pl = atom.lr.export_pipeline()\n>>> print(pl.predict(X))\n\n[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n 1 0 0 1 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 1\n 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1\n 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 1 1 1 0 1 1 0 1 1 1 1 0 1\n 1 1 1 1 1 1 1 1 0 0 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0\n 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1\n 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0\n 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1\n 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 0 0 0 1 1\n 1 1 0 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 0\n 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1\n 1 0 1 1 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 0 1 1\n 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1\n 1 1 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 0 0 1 0 1 0 1 1 1 1 1 0 1 1 0 1 0 1 0 0\n 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n 1 1 1 1 1 1 1 0 0 0 0 0 0 1]\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/pipeline/pipeline/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                                                                                  decision_functionTransform, then decision_function of the final estimator.fitFit the pipeline.fit_predictTransform the data, and apply fit_predict with the final estimator.fit_transformFit the pipeline and transform the data.get_feature_names_outGet output feature names for transformation.get_paramsGet parameters for this estimator.inverse_transformInverse transform for each step in a reverse order.predictTransform, then predict of the final estimator.predict_log_probaTransform, then predict_log_proba of the final estimator.predict_probaTransform, then predict_proba of the final estimator.scoreTransform, then score of the final estimator.score_samplesTransform the data, and apply score_samples with the final estimator.set_outputSet the output container when \"transform\" and \"fit_transform\" are called.set_paramsSet the parameters of this estimator.transformTransform the data.

                                                                                                                                                                                                                                                                                                                                                                  method decision_function(X)[source]Transform, then decision_function of the final estimator.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                                                                                                                                                                                                  Returnsnp.ndarray Predicted confidence scores.

                                                                                                                                                                                                                                                                                                                                                                  method fit(X=None, y=None, **fit_params)[source]Fit the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

                                                                                                                                                                                                                                                                                                                                                                  y: int, str, dict, sequence or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • Else: Array with shape=(n_samples,) to use as target.

                                                                                                                                                                                                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                                                                                                                                                                                                  Returnsself Estimator instance.

                                                                                                                                                                                                                                                                                                                                                                  method fit_predict(X, y=None, **fit_params)[source]Transform the data, and apply fit_predict with the final estimator.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX : iterable Training data. Must fulfill input requirements of first step of the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  y : iterable, default=None Training targets. Must fulfill label requirements for all steps of the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  **fit_params : dict of string -> object Parameters passed to the fit method of each step, where each parameter name is prefixed such that parameter p for step s has key s__p.

                                                                                                                                                                                                                                                                                                                                                                  Returnsy_pred : ndarray Result of calling fit_predict on the final estimator.

                                                                                                                                                                                                                                                                                                                                                                  method fit_transform(X=None, y=None, **fit_params)[source]Fit the pipeline and transform the data.

                                                                                                                                                                                                                                                                                                                                                                  Call fit followed by transform on each transformer in the pipeline. The transformed data are finally passed to the final estimator that calls the transform method. Only valid if the final estimator implements transform. This also works when the final estimator is None, in which case all prior transformations are applied.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. None if the estimator only uses y.

                                                                                                                                                                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  **fit_params Additional keyword arguments for the fit method.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                                  method get_feature_names_out(input_features=None)[source]Get output feature names for transformation.

                                                                                                                                                                                                                                                                                                                                                                  Parametersinput_features : array-like of str or None, default=None Input features.

                                                                                                                                                                                                                                                                                                                                                                  Returnsfeature_names_out : ndarray of str objects Transformed feature names.

                                                                                                                                                                                                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                                                                                                                                                                                                  Returnsparams : mapping of string to any Parameter names mapped to their values.

                                                                                                                                                                                                                                                                                                                                                                  method inverse_transform(X=None, y=None)[source]Inverse transform for each step in a reverse order.

                                                                                                                                                                                                                                                                                                                                                                  All estimators in the pipeline must implement the inverse_transform method.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. None if the pipeline only uses y.

                                                                                                                                                                                                                                                                                                                                                                  y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Name of the target column and sequence of values.
                                                                                                                                                                                                                                                                                                                                                                  • If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                                  • If dataframe: Target columns for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                                  method predict(X, **predict_params)[source]Transform, then predict of the final estimator.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                                                                                                                                                                                                  **predict_params Additional keyword arguments for the predict method. Note that while this may be used to return uncertainties from some models with return_std or return_cov, uncertainties that are generated by the transformations in the pipeline are not propagated to the final estimator.

                                                                                                                                                                                                                                                                                                                                                                  Returnsnp.ndarray Predicted classes with shape=(n_samples,).

                                                                                                                                                                                                                                                                                                                                                                  method predict_log_proba(X)[source]Transform, then predict_log_proba of the final estimator.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                                                                                                                                                                                                  Returnsnp.ndarray Predicted class log-probabilities.

                                                                                                                                                                                                                                                                                                                                                                  method predict_proba(X)[source]Transform, then predict_proba of the final estimator.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                                                                                                                                                                                                  Returnsnp.ndarray Predicted class probabilities.

                                                                                                                                                                                                                                                                                                                                                                  method score(X, y, sample_weight=None)[source]Transform, then score of the final estimator.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

                                                                                                                                                                                                                                                                                                                                                                  y: int, str, dict, sequence

                                                                                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • Else: Array with shape=(n_samples,) to use as target.

                                                                                                                                                                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y.

                                                                                                                                                                                                                                                                                                                                                                  Returnsfloat Mean accuracy or r2 of self.predict(X) with respect to y.

                                                                                                                                                                                                                                                                                                                                                                  method score_samples(X)[source]Transform the data, and apply score_samples with the final estimator.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX : iterable Data to predict on. Must fulfill input requirements of first step of the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  Returnsy_score : ndarray of shape (n_samples,) Result of calling score_samples on the final estimator.

                                                                                                                                                                                                                                                                                                                                                                  method set_output(transform=None)[source]Set the output container when \"transform\" and \"fit_transform\" are called.

                                                                                                                                                                                                                                                                                                                                                                  Parameterstransform : {\"default\", \"pandas\"}, default=None Configure output of transform and fit_transform.

                                                                                                                                                                                                                                                                                                                                                                  • \"default\": Default output format of a transformer
                                                                                                                                                                                                                                                                                                                                                                  • \"pandas\": DataFrame output
                                                                                                                                                                                                                                                                                                                                                                  • None: Transform configuration is unchanged

                                                                                                                                                                                                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                                                                                                                                                                                                  method set_params(**kwargs)[source]Set the parameters of this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs : dict Parameters of this estimator or parameters of estimators contained in steps. Parameters of the steps may be set using its name and the parameter name separated by a '__'.

                                                                                                                                                                                                                                                                                                                                                                  Returnsself : object Pipeline class instance.

                                                                                                                                                                                                                                                                                                                                                                  method transform(X=None, y=None, **kwargs)[source]Transform the data.

                                                                                                                                                                                                                                                                                                                                                                  Call transform on each transformer in the pipeline. The transformed data are finally passed to the final estimator that calls the transform method. Only valid if the final estimator implements transform. This also works when the final estimator is None, in which case all prior transformations are applied.

                                                                                                                                                                                                                                                                                                                                                                  ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. None if the pipeline only uses y.

                                                                                                                                                                                                                                                                                                                                                                  y: int, str, dict, sequence or None, default=None Target column corresponding to `X`.

                                                                                                                                                                                                                                                                                                                                                                  • If None: y is ignored.
                                                                                                                                                                                                                                                                                                                                                                  • If int: Position of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the target column in X.
                                                                                                                                                                                                                                                                                                                                                                  • Else: Array with shape=(n_samples,) to use as target.

                                                                                                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for the _iter inner method.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdataframe Transformed feature set. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                                  series or dataframe Transformed target column. Only returned if provided.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_calibration/", "title": "plot_calibration", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_calibration(models=None, rows=\"test\", n_bins=10, target=0, title=None, legend=\"upper left\", figsize=(900, 900), filename=None, display=True)[source]Plot the calibration curve for a binary classifier.

                                                                                                                                                                                                                                                                                                                                                                  Well-calibrated classifiers are probabilistic classifiers for which the output of the predict_proba method can be directly interpreted as a confidence level. For instance, a calibrated (binary) classifier should classify the samples such that among the samples to which it gave a predict_proba value close to 0.8, approx. 80% actually belong to the positive class. Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                                                                                  This figure shows two plots: the calibration curve, where the x-axis represents the average predicted probability in each bin and the y-axis is the fraction of positives, i.e., the proportion of samples whose class is the positive class (in each bin); and a distribution of all predicted probabilities of the classifier. This plot is available only for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                                                                                  Use the calibrate method to calibrate the winning model.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                                                                                                                                                                                                  rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric.

                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the data set to plot.
                                                                                                                                                                                                                                                                                                                                                                  • If sequence: Names of the data sets to plot.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Names of the sets with corresponding selection of rows as values.

                                                                                                                                                                                                                                                                                                                                                                  target: int or str, default=0 Target column to look at. Only for multilabel tasks.

                                                                                                                                                                                                                                                                                                                                                                  n_bins: int, default=10 Number of bins used for calibration. Minimum of 5 required.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"upper left\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple, default=(900, 900) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_lift Plot the lift curve.

                                                                                                                                                                                                                                                                                                                                                                  plot_prc Plot the precision-recall curve.

                                                                                                                                                                                                                                                                                                                                                                  plot_roc Plot the Receiver Operating Characteristics curve.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_calibration/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"RF\", \"LGB\"])\n>>> atom.plot_calibration()\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_components/", "title": "plot_components", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_components(show=None, title=None, legend=\"lower right\", figsize=None, filename=None, display=True)[source]Plot the explained variance ratio per component.

                                                                                                                                                                                                                                                                                                                                                                  Kept components are colored and discarded components are transparent. This plot is available only when feature selection was applied with strategy=\"pca\".

                                                                                                                                                                                                                                                                                                                                                                  Parametersshow: int or None, default=None Number of components to show. None to show all.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of components shown.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_pca Plot the explained variance ratio vs number of components.

                                                                                                                                                                                                                                                                                                                                                                  plot_rfecv Plot the rfecv results.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_components/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.feature_selection(\"pca\", n_features=5)\n>>> atom.plot_components(show=10)\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_confusion_matrix/", "title": "plot_confusion_matrix", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_confusion_matrix(models=None, rows=\"test\", target=0, threshold=0.5, title=None, legend=\"upper right\", figsize=None, filename=None, display=True)[source]Plot a model's confusion matrix.

                                                                                                                                                                                                                                                                                                                                                                  For one model, the plot shows a heatmap. For multiple models, it compares TP, FP, FN and TN in a barplot (not implemented for multiclass classification tasks). This plot is available only for classification tasks.

                                                                                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                                                                                  Fill the threshold parameter with the result from the model's get_best_threshold method to optimize the results.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                                                                                                                                                                                                  rows: hashable, segment or sequence, default=\"test\" Selection of rows on which to calculate the confusion matrix.

                                                                                                                                                                                                                                                                                                                                                                  target: int or str, default=0 Target column to look at. Only for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  threshold: float, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only for binary classification tasks.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"upper right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the plot's type.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_calibration Plot the calibration curve for a binary classifier.

                                                                                                                                                                                                                                                                                                                                                                  plot_threshold Plot metric performances against threshold values.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_confusion_matrix/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=100, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, test_size=0.4)\n>>> atom.run([\"LR\", \"RF\"])\n>>> atom.lr.plot_confusion_matrix()  # For one model\n
                                                                                                                                                                                                                                                                                                                                                                  >>> atom.plot_confusion_matrix()  # For multiple models\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_correlation/", "title": "plot_correlation", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_correlation(columns=None, method=\"pearson\", title=None, legend=None, figsize=(800, 700), filename=None, display=True)[source]Plot a correlation matrix.

                                                                                                                                                                                                                                                                                                                                                                  Displays a heatmap showing the correlation between columns in the dataset. The colors red, blue and white stand for positive, negative, and no correlation respectively.

                                                                                                                                                                                                                                                                                                                                                                  Parameterscolumns: segment, sequence, dataframe or None, default=None Columns to plot. If None, plot all columns in the dataset. Selected categorical columns are ignored.

                                                                                                                                                                                                                                                                                                                                                                  method: str, default=\"pearson\" Method of correlation. Choose from: pearson, kendall or spearman.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple, default=(800, 700) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_distribution Plot column distributions.

                                                                                                                                                                                                                                                                                                                                                                  plot_qq Plot a quantile-quantile plot.

                                                                                                                                                                                                                                                                                                                                                                  plot_relationships Plot pairwise relationships in a dataset.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_correlation/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.plot_correlation()\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_det/", "title": "plot_det", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_det(models=None, rows=\"test\", target=0, title=None, legend=\"upper right\", figsize=(900, 600), filename=None, display=True)[source]Plot the Detection Error Tradeoff curve.

                                                                                                                                                                                                                                                                                                                                                                  Read more about DET in sklearn's documentation. Only available for binary classification tasks.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                                                                                                                                                                                                  rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric.

                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the data set to plot.
                                                                                                                                                                                                                                                                                                                                                                  • If sequence: Names of the data sets to plot.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Names of the sets with corresponding selection of rows as values.

                                                                                                                                                                                                                                                                                                                                                                  target: int or str, default=0 Target column to look at. Only for multilabel tasks.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"upper right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_gains Plot the cumulative gains curve.

                                                                                                                                                                                                                                                                                                                                                                  plot_roc Plot the Receiver Operating Characteristics curve.

                                                                                                                                                                                                                                                                                                                                                                  plot_prc Plot the precision-recall curve.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_det/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"LR\", \"RF\"])\n>>> atom.plot_det()\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_distribution/", "title": "plot_distribution", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_distribution(columns=0, distributions=\"kde\", show=None, title=None, legend=\"upper right\", figsize=None, filename=None, display=True)[source]Plot column distributions.

                                                                                                                                                                                                                                                                                                                                                                  • For numerical columns, plot the probability density distribution. Additionally, it's possible to plot any of scipy.stats distributions fitted to the column.
                                                                                                                                                                                                                                                                                                                                                                  • For categorical columns, plot the class distribution. Only one categorical column can be plotted at the same time.

                                                                                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                                                                                  Use atom's distribution method to check which distribution fits the column best.

                                                                                                                                                                                                                                                                                                                                                                  Parameterscolumns: int, str, slice or sequence, default=0 Columns to plot. It's only possible to plot one categorical column. If more than one categorical column is selected, all categorical columns are ignored.

                                                                                                                                                                                                                                                                                                                                                                  distributions: str, sequence or None, default=\"kde\" Distributions to fit. Only for numerical columns.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No distribution is fit.
                                                                                                                                                                                                                                                                                                                                                                  • If \"kde\": Fit a Gaussian kde distribution.
                                                                                                                                                                                                                                                                                                                                                                  • Else: Name of a scipy.stats distribution.

                                                                                                                                                                                                                                                                                                                                                                  show: int or None, default=None Number of classes (ordered by number of occurrences) to show in the plot. If None, it shows all classes. Only for categorical columns.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"upper right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the plot's type.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_correlation Plot a correlation matrix.

                                                                                                                                                                                                                                                                                                                                                                  plot_qq Plot a quantile-quantile plot.

                                                                                                                                                                                                                                                                                                                                                                  plot_relationships Plot pairwise relationships in a dataset.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_distribution/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> import numpy as np\n>>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> # Add a categorical feature\n>>> animals = [\"cat\", \"dog\", \"bird\", \"lion\", \"zebra\"]\n>>> probabilities = [0.001, 0.1, 0.2, 0.3, 0.399]\n>>> X[\"animals\"] = np.random.choice(animals, size=len(X), p=probabilities)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.plot_distribution(columns=[0, 1])\n
                                                                                                                                                                                                                                                                                                                                                                  >>> atom.plot_distribution(columns=0, distributions=[\"norm\", \"invgauss\"])\n
                                                                                                                                                                                                                                                                                                                                                                  >>> atom.plot_distribution(columns=\"animals\")\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_edf/", "title": "plot_edf", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_edf(models=None, metric=None, title=None, legend=\"upper left\", figsize=(900, 600), filename=None, display=True)[source]Plot the Empirical Distribution Function of a study.

                                                                                                                                                                                                                                                                                                                                                                  Use this plot to analyze and improve hyperparameter search spaces. The EDF assumes that the value of the objective function is in accordance with the uniform distribution over the objective space. This plot is only available for models that ran hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                                                                  Note

                                                                                                                                                                                                                                                                                                                                                                  Only complete trials are considered when plotting the EDF.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models that used hyperparameter tuning are selected.

                                                                                                                                                                                                                                                                                                                                                                  metric: int, str, sequence or None, default=None Metric to plot (only for multi-metric runs). If str, add + between options to select more than one. If None, the metric used to run the pipeline is selected.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"upper left\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_hyperparameters Plot hyperparameter relationships in a study.

                                                                                                                                                                                                                                                                                                                                                                  plot_trials Plot the hyperparameter tuning trials.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_edf/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from optuna.distributions import IntDistribution\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n\n>>> # Run three models with different search spaces\n>>> atom.run(\n...     models=\"RF_1\",\n...     n_trials=20,\n...     ht_params={\"distributions\": {\"n_estimators\": IntDistribution(6, 10)}},\n... )\n>>> atom.run(\n...     models=\"RF_2\",\n...     n_trials=20,\n...     ht_params={\"distributions\": {\"n_estimators\": IntDistribution(11, 15)}},\n... )\n>>> atom.run(\n...     models=\"RF_3\",\n...     n_trials=20,\n...     ht_params={\"distributions\": {\"n_estimators\": IntDistribution(16, 20)}},\n... )\n\n>>> atom.plot_edf()\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_errors/", "title": "plot_errors", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_errors(models=None, rows=\"test\", target=0, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot a model's prediction errors.

                                                                                                                                                                                                                                                                                                                                                                  Plot the actual targets from a set against the predicted values generated by the regressor. A linear fit is made on the data. The gray, intersected line shows the identity line. This plot can be useful to detect noise or heteroscedasticity along a range of the target domain. This plot is available only for regression tasks.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                                                                                                                                                                                                  rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric.

                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the data set to plot.
                                                                                                                                                                                                                                                                                                                                                                  • If sequence: Names of the data sets to plot.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Names of the sets with corresponding selection of rows as values.

                                                                                                                                                                                                                                                                                                                                                                  target: int or str, default=0 Target column to look at. Only for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_residuals Plot a model's residuals.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_errors/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMRegressor\n>>> from sklearn.datasets import load_diabetes\n\n>>> X, y = load_diabetes(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMRegressor(X, y)\n>>> atom.run([\"OLS\", \"LGB\"])\n>>> atom.plot_errors()\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_evals/", "title": "plot_evals", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_evals(models=None, dataset=\"test\", title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot evaluation curves.

                                                                                                                                                                                                                                                                                                                                                                  The evaluation curves are the main metric scores achieved by the models at every iteration of the training process. This plot is available only for models that allow in-training validation.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                                                                                                                                                                                                  dataset: str, default=\"test\" Data set for which to plot the evaluation curves. Use + between options to select more than one. Choose from: \"train\", \"test\".

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_trials Plot the hyperparameter tuning trials.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_evals/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"XGB\", \"LGB\"])\n>>> atom.plot_evals()\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_feature_importance/", "title": "plot_feature_importance", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_feature_importance(models=None, show=None, title=None, legend=\"lower right\", figsize=None, filename=None, display=True)[source]Plot a model's feature importance.

                                                                                                                                                                                                                                                                                                                                                                  The sum of importances for all features (per model) is 1. This plot is available only for models whose estimator has a scores_, feature_importances_ or coef attribute.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                                                                                                                                                                                                  show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_parshap Plot the partial correlation of shap values.

                                                                                                                                                                                                                                                                                                                                                                  plot_partial_dependence Plot the partial dependence of features.

                                                                                                                                                                                                                                                                                                                                                                  plot_permutation_importance Plot the feature permutation importance of models.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_feature_importance/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"LR\", \"RF\"])\n>>> atom.plot_feature_importance(show=10)\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_forecast/", "title": "plot_forecast", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_forecast(models=None, fh=\"test\", X=None, target=0, plot_interval=True, title=None, legend=\"upper left\", figsize=(900, 600), filename=None, display=True)[source]Plot a time series with model forecasts.

                                                                                                                                                                                                                                                                                                                                                                  This plot is only available for forecasting tasks.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. If no models are selected, only the target column is plotted.

                                                                                                                                                                                                                                                                                                                                                                  fh: hashable, segment, sequence or ForecastingHorizon, default=\"test\" Forecast horizon for which to plot the predictions.

                                                                                                                                                                                                                                                                                                                                                                  X: dataframe-like or None, default=None Exogenous time series corresponding to fh. This parameter is ignored if fh is a data set.

                                                                                                                                                                                                                                                                                                                                                                  target: int or str, default=0 Target column to look at. Only for multivariate tasks.

                                                                                                                                                                                                                                                                                                                                                                  plot_interval: bool, default=True Whether to plot prediction intervals instead of the exact prediction values. If True, the plotted estimators should have a predict_interval method.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"upper left\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_lift Plot the lift curve.

                                                                                                                                                                                                                                                                                                                                                                  plot_prc Plot the precision-recall curve.

                                                                                                                                                                                                                                                                                                                                                                  plot_roc Plot the Receiver Operating Characteristics curve.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_forecast/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMForecaster\n>>> from sktime.datasets import load_airline\n\n>>> y = load_airline()\n\n>>> atom = ATOMForecaster(y, random_state=1)\n>>> atom.plot_forecast()\n
                                                                                                                                                                                                                                                                                                                                                                  >>> atom.run(\n...     models=\"arima\",\n...     est_params={\"order\": (1, 1, 0), \"seasonal_order\": (0, 1, 0, 12)},\n... )\n>>> atom.plot_forecast()\n
                                                                                                                                                                                                                                                                                                                                                                  >>> atom.plot_forecast(fh=\"train+test\", plot_interval=False)\n
                                                                                                                                                                                                                                                                                                                                                                  >>> # Forecast the next 4 years starting from the test set\n>>> atom.plot_forecast(fh=range(1, 48))\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_gains/", "title": "plot_gains", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_gains(models=None, rows=\"test\", target=0, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot the cumulative gains curve.

                                                                                                                                                                                                                                                                                                                                                                  This plot is available only for binary and multilabel classification tasks.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                                                                                                                                                                                                  rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric.

                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the data set to plot.
                                                                                                                                                                                                                                                                                                                                                                  • If sequence: Names of the data sets to plot.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Names of the sets with corresponding selection of rows as values.

                                                                                                                                                                                                                                                                                                                                                                  target: int or str, default=0 Target column to look at. Only for multilabel tasks.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_det Plot the Detection Error Tradeoff curve.

                                                                                                                                                                                                                                                                                                                                                                  plot_lift Plot the lift curve.

                                                                                                                                                                                                                                                                                                                                                                  plot_roc Plot the Receiver Operating Characteristics curve.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_gains/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"LR\", \"RF\"])\n>>> atom.plot_gains()\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_hyperparameter_importance/", "title": "plot_hyperparameter_importance", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_hyperparameter_importance(models=None, metric=0, show=None, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot a model's hyperparameter importance.

                                                                                                                                                                                                                                                                                                                                                                  The hyperparameter importances are calculated using the fANOVA importance evaluator. The sum of all importances for all parameters (per model) is 1. This plot is only available for models that ran hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models that used hyperparameter tuning are selected.

                                                                                                                                                                                                                                                                                                                                                                  metric: int or str, default=0 Metric to plot (only for multi-metric runs).

                                                                                                                                                                                                                                                                                                                                                                  show: int or None, default=None Number of hyperparameters (ordered by importance) to show. None to show all.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=None Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of hyperparameters shown.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_feature_importance Plot a model's feature importance.

                                                                                                                                                                                                                                                                                                                                                                  plot_hyperparameters Plot hyperparameter relationships in a study.

                                                                                                                                                                                                                                                                                                                                                                  plot_trials Plot the hyperparameter tuning trials.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_hyperparameter_importance/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"ET\", \"RF\"], n_trials=10)\n>>> atom.plot_hyperparameter_importance()\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_hyperparameters/", "title": "plot_hyperparameters", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_hyperparameters(models=None, params=(0, 1), metric=0, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot hyperparameter relationships in a study.

                                                                                                                                                                                                                                                                                                                                                                  A model's hyperparameters are plotted against each other. The corresponding metric scores are displayed in a contour plot. The markers are the trials in the study. This plot is only available for models that ran hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., atom.lr.plot_hyperparameters().

                                                                                                                                                                                                                                                                                                                                                                  params: str, segment or sequence, default=(0, 1) Hyperparameters to plot. Use a sequence or add + between options to select more than one.

                                                                                                                                                                                                                                                                                                                                                                  metric: int or str, default=0 Metric to plot (only for multi-metric runs).

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of hyperparameters shown.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_hyperparameter_importance Plot a model's hyperparameter importance.

                                                                                                                                                                                                                                                                                                                                                                  plot_parallel_coordinate Plot high-dimensional parameter relationships in a study.

                                                                                                                                                                                                                                                                                                                                                                  plot_trials Plot the hyperparameter tuning trials.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_hyperparameters/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\"LR\", n_trials=15)\n>>> atom.plot_hyperparameters(params=(0, 1, 2))\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_learning_curve/", "title": "plot_learning_curve", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_learning_curve(models=None, metric=None, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot the learning curve: score vs number of training samples.

                                                                                                                                                                                                                                                                                                                                                                  This plot is available only for models fitted using train sizing. Ensembles are ignored.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                                                                                                                                                                                                  metric: int, str, sequence or None, default=None Metric to plot (only for multi-metric runs). Use a sequence or add + between options to select more than one. If None, the metric used to run the pipeline is selected.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_results Plot the model results.

                                                                                                                                                                                                                                                                                                                                                                  plot_successive_halving Plot scores per iteration of the successive halving.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_learning_curve/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.train_sizing([\"LR\", \"RF\"], n_bootstrap=5)\n>>> atom.plot_learning_curve()\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_lift/", "title": "plot_lift", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_lift(models=None, rows=\"test\", target=0, title=None, legend=\"upper right\", figsize=(900, 600), filename=None, display=True)[source]Plot the lift curve.

                                                                                                                                                                                                                                                                                                                                                                  Only available for binary classification tasks.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                                                                                                                                                                                                  rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric.

                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the data set to plot.
                                                                                                                                                                                                                                                                                                                                                                  • If sequence: Names of the data sets to plot.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Names of the sets with corresponding selection of rows as values.

                                                                                                                                                                                                                                                                                                                                                                  target: int or str, default=0 Target column to look at. Only for multilabel tasks.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"upper right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_det Plot the Detection Error Tradeoff curve.

                                                                                                                                                                                                                                                                                                                                                                  plot_gains Plot the cumulative gains curve.

                                                                                                                                                                                                                                                                                                                                                                  plot_prc Plot the precision-recall curve.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_lift/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"LR\", \"RF\"])\n>>> atom.plot_lift()\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_ngrams/", "title": "plot_ngrams", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_ngrams(ngram=\"bigram\", rows=\"dataset\", show=10, title=None, legend=\"lower right\", figsize=None, filename=None, display=True)[source]Plot n-gram frequencies.

                                                                                                                                                                                                                                                                                                                                                                  The text for the plot is extracted from the column named corpus. If there is no column with that name, an exception is raised. If the documents are not tokenized, the words are separated by spaces.

                                                                                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                                                                                  Use atom's tokenize method to separate the words creating n-grams based on their frequency in the corpus.

                                                                                                                                                                                                                                                                                                                                                                  Parametersngram: str or int, default=\"bigram\" Number of contiguous words to search for (size of n-gram). Choose from: word (1), bigram (2), trigram (3), quadgram (4).

                                                                                                                                                                                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"dataset\" Selection of rows in the corpus to include in the search.

                                                                                                                                                                                                                                                                                                                                                                  show: int or None, default=10 Number of n-grams (ordered by number of occurrences) to show in the plot. If none, show all n-grams (up to 200).

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of n-grams shown.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_wordcloud Plot a wordcloud from the corpus.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_ngrams/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> import numpy as np\n>>> from atom import ATOMClassifier\n>>> from sklearn.datasets import fetch_20newsgroups\n\n>>> X, y = fetch_20newsgroups(\n...     return_X_y=True,\n...     categories=[\"alt.atheism\", \"sci.med\", \"comp.windows.x\"],\n...     shuffle=True,\n...     random_state=1,\n... )\n>>> X = np.array(X).reshape(-1, 1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.textclean()\n>>> atom.textnormalize()\n>>> atom.plot_ngrams()\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_parallel_coordinate/", "title": "plot_parallel_coordinate", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_parallel_coordinate(models=None, params=None, metric=0, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot high-dimensional parameter relationships in a study.

                                                                                                                                                                                                                                                                                                                                                                  Every line of the plot represents one trial. This plot is only available for models that ran hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., atom.lr.plot_parallel_coordinate().

                                                                                                                                                                                                                                                                                                                                                                  params: str, segment, sequence or None, default=None Hyperparameters to plot. Use a sequence or add + between options to select more than one. If None, all the model's hyperparameters are selected.

                                                                                                                                                                                                                                                                                                                                                                  metric: int or str, default=0 Metric to plot (only for multi-metric runs).

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of hyperparameters shown.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_edf Plot the Empirical Distribution Function of a study.

                                                                                                                                                                                                                                                                                                                                                                  plot_hyperparameter_importance Plot a model's hyperparameter importance.

                                                                                                                                                                                                                                                                                                                                                                  plot_hyperparameters Plot hyperparameter relationships in a study.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_parallel_coordinate/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\"RF\", n_trials=15)\n>>> atom.plot_parallel_coordinate(params=slice(1, 5))\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_pareto_front/", "title": "plot_pareto_front", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_pareto_front(models=None, metric=None, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot the Pareto front of a study.

                                                                                                                                                                                                                                                                                                                                                                  Shows the trial scores plotted against each other. The marker's colors indicate the trial number. This plot is only available for models with multi-metric runs and hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., atom.lr.plot_pareto_front().

                                                                                                                                                                                                                                                                                                                                                                  metric: str, sequence or None, default=None Metrics to plot. Use a sequence or add + between options to select more than one. If None, the metrics used to run the pipeline are selected.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of metrics shown.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_edf Plot the Empirical Distribution Function of a study.

                                                                                                                                                                                                                                                                                                                                                                  plot_slice Plot the parameter relationship in a study.

                                                                                                                                                                                                                                                                                                                                                                  plot_trials Plot the hyperparameter tuning trials.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_pareto_front/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\n...     models=\"RF\",\n...     metric=[\"f1\", \"accuracy\", \"recall\"],\n...     n_trials=15,\n...  )\n>>> atom.plot_pareto_front()\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_parshap/", "title": "plot_parshap", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_parshap(models=None, columns=None, target=1, title=None, legend=\"upper left\", figsize=(900, 600), filename=None, display=True)[source]Plot the partial correlation of shap values.

                                                                                                                                                                                                                                                                                                                                                                  Plots the train and test correlation between the shap value of every feature with its target value, after removing the effect of all other features (partial correlation). This plot is useful to identify the features that are contributing most to overfitting. Features that lie below the bisector (diagonal line) performed worse on the test set than on the training set. If the estimator has a scores_, feature_importances_ or coef_ attribute, its normalized values are shown in a color map.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                                                                                                                                                                                                  columns: int, str, segment, sequence, dataframe or None, default=None XSelector to plot. If None, it plots all features.

                                                                                                                                                                                                                                                                                                                                                                  target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"upper left\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_feature_importance Plot a model's feature importance.

                                                                                                                                                                                                                                                                                                                                                                  plot_partial_dependence Plot the partial dependence of features.

                                                                                                                                                                                                                                                                                                                                                                  plot_permutation_importance Plot the feature permutation importance of models.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_parshap/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"GNB\", \"RF\"])\n>>> atom.rf.plot_parshap(legend=None)\n
                                                                                                                                                                                                                                                                                                                                                                  >>> atom.plot_parshap(columns=slice(5, 10))\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_partial_dependence/", "title": "plot_partial_dependence", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_partial_dependence(models=None, columns=(0, 1, 2), kind=\"average\", pair=None, target=1, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot the partial dependence of features.

                                                                                                                                                                                                                                                                                                                                                                  The partial dependence of a feature (or a set of features) corresponds to the response of the model for each possible value of the feature. The plot can take two forms:

                                                                                                                                                                                                                                                                                                                                                                  • If pair is None: Single feature partial dependence lines. The deciles of the feature values are shown with tick marks on the bottom.
                                                                                                                                                                                                                                                                                                                                                                  • If pair is defined: Two-way partial dependence plots are plotted as contour plots (only allowed for a single model).

                                                                                                                                                                                                                                                                                                                                                                  Read more about partial dependence on sklearn's documentation. This plot is not available for multilabel nor multiclass-multioutput classification tasks.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                                                                                                                                                                                                  columns: int, str, segment, sequence, dataframe, default=(0, 1, 2) XSelector to get the partial dependence from.

                                                                                                                                                                                                                                                                                                                                                                  kind: str or sequence, default=\"average\" Kind of dependence to plot. Use a sequence or add + between options to select more than one. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"average\": Partial dependence averaged across all samples in the dataset.
                                                                                                                                                                                                                                                                                                                                                                  • \"individual\": Partial dependence for up to 50 random samples (Individual Conditional Expectation).

                                                                                                                                                                                                                                                                                                                                                                  This parameter is ignored when plotting feature pairs.

                                                                                                                                                                                                                                                                                                                                                                  pair: int, str or None, default=None Feature with which to pair the features selected by columns. If specified, the resulting figure displays contour plots. Only allowed when plotting a single model. If None, the plots show the partial dependence of single features.

                                                                                                                                                                                                                                                                                                                                                                  target: int or str, default=1 Class in the target column to look at (only for multiclass classification tasks).

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_feature_importance Plot a model's feature importance.

                                                                                                                                                                                                                                                                                                                                                                  plot_parshap Plot the partial correlation of shap values.

                                                                                                                                                                                                                                                                                                                                                                  plot_permutation_importance Plot the feature permutation importance of models.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_partial_dependence/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"LR\", \"RF\"])\n>>> atom.plot_partial_dependence(kind=\"average+individual\", legend=\"upper left\")\n
                                                                                                                                                                                                                                                                                                                                                                  >>> atom.rf.plot_partial_dependence(columns=(3, 4), pair=2)\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_pca/", "title": "plot_pca", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_pca(title=None, legend=None, figsize=(900, 600), filename=None, display=True)[source]Plot the explained variance ratio vs number of components.

                                                                                                                                                                                                                                                                                                                                                                  If the underlying estimator is PCA (for dense datasets), all possible components are plotted. If the underlying estimator is TruncatedSVD (for sparse datasets), it only shows the selected components. The star marks the number of components selected by the user. This plot is available only when feature selection was applied with strategy=\"pca\".

                                                                                                                                                                                                                                                                                                                                                                  Parameterstitle: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_components Plot the explained variance ratio per component.

                                                                                                                                                                                                                                                                                                                                                                  plot_rfecv Plot the rfecv results.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_pca/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.feature_selection(\"pca\", n_features=5)\n>>> atom.plot_pca()\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_permutation_importance/", "title": "plot_permutation_importance", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_permutation_importance(models=None, show=None, n_repeats=10, title=None, legend=\"lower right\", figsize=None, filename=None, display=True)[source]Plot the feature permutation importance of models.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  This method can be slow. Results are cached to fasten repeated calls.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                                                                                                                                                                                                  show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features.

                                                                                                                                                                                                                                                                                                                                                                  n_repeats: int, default=10 Number of times to permute each feature.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_feature_importance Plot a model's feature importance.

                                                                                                                                                                                                                                                                                                                                                                  plot_partial_dependence Plot the partial dependence of features.

                                                                                                                                                                                                                                                                                                                                                                  plot_parshap Plot the partial correlation of shap values.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_permutation_importance/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"LR\", \"RF\"])\n>>> atom.plot_permutation_importance(show=10, n_repeats=7)\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_pipeline/", "title": "plot_pipeline", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_pipeline(models=None, draw_hyperparameter_tuning=True, color_branches=None, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot a diagram of the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  This plot uses the schemdraw package, which is incompatible with plotly. The returned plot is therefore a matplotlib figure.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models for which to draw the pipeline. If None, all pipelines are plotted.

                                                                                                                                                                                                                                                                                                                                                                  draw_hyperparameter_tuning: bool, default=True Whether to draw if the models used Hyperparameter Tuning.

                                                                                                                                                                                                                                                                                                                                                                  color_branches: bool or None, default=None Whether to draw every branch in a different color. If None, branches are colored when there is more than one.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the pipeline drawn.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as png. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsplt.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_wordcloud Plot a wordcloud from the corpus.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_pipeline/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"GNB\", \"RNN\", \"SGD\", \"MLP\"])\n>>> atom.voting(models=atom.winners[:2])\n>>> atom.plot_pipeline()\n
                                                                                                                                                                                                                                                                                                                                                                  >>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.scale()\n>>> atom.prune()\n>>> atom.run(\"RF\", n_trials=30)\n\n>>> atom.branch = \"undersample\"\n>>> atom.balance(\"nearmiss\")\n>>> atom.run(\"RF_undersample\")\n\n>>> atom.branch = \"oversample_from_main\"\n>>> atom.balance(\"smote\")\n>>> atom.run(\"RF_oversample\")\n\n>>> atom.plot_pipeline()\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_prc/", "title": "plot_prc", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_prc(models=None, rows=\"test\", target=0, title=None, legend=\"lower left\", figsize=(900, 600), filename=None, display=True)[source]Plot the precision-recall curve.

                                                                                                                                                                                                                                                                                                                                                                  Read more about PRC in sklearn's documentation. Only available for binary classification tasks.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                                                                                                                                                                                                  rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric.

                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the data set to plot.
                                                                                                                                                                                                                                                                                                                                                                  • If sequence: Names of the data sets to plot.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Names of the sets with corresponding selection of rows as values.

                                                                                                                                                                                                                                                                                                                                                                  target: int or str, default=0 Target column to look at. Only for multilabel tasks.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"lower left\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_det Plot the Detection Error Tradeoff curve.

                                                                                                                                                                                                                                                                                                                                                                  plot_lift Plot the lift curve.

                                                                                                                                                                                                                                                                                                                                                                  plot_roc Plot the Receiver Operating Characteristics curve.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_prc/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"LR\", \"RF\"])\n>>> atom.plot_prc()\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_probabilities/", "title": "plot_probabilities", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_probabilities(models=None, rows=\"test\", target=1, title=None, legend=\"upper right\", figsize=(900, 600), filename=None, display=True)[source]Plot the probability distribution of the target classes.

                                                                                                                                                                                                                                                                                                                                                                  This plot is available only for models with a predict_proba method in classification tasks.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                                                                                                                                                                                                  rows: hashable, segment or sequence, default=\"test\" Selection of rows on which to calculate the metric.

                                                                                                                                                                                                                                                                                                                                                                  target: int, str or tuple, default=1 Probability of being that class in the target column. For multioutput tasks, the value should be a tuple of the form (column, class).

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"upper right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_confusion_matrix Plot a model's confusion matrix.

                                                                                                                                                                                                                                                                                                                                                                  plot_results Plot the model results.

                                                                                                                                                                                                                                                                                                                                                                  plot_threshold Plot metric performances against threshold values.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_probabilities/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"LR\", \"RF\"])\n>>> atom.plot_probabilities()\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_qq/", "title": "plot_qq", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_qq(columns=0, distributions=\"norm\", title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot a quantile-quantile plot.

                                                                                                                                                                                                                                                                                                                                                                  Columns are distinguished by color and the distributions are distinguished by marker type. Missing values are ignored.

                                                                                                                                                                                                                                                                                                                                                                  Parameterscolumns: int, str, slice or sequence, default=0 Columns to plot. Selected categorical columns are ignored.

                                                                                                                                                                                                                                                                                                                                                                  distributions: str or sequence, default=\"norm\" Names of the scipy.stats distributions to fit to the columns.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_correlation Plot a correlation matrix.

                                                                                                                                                                                                                                                                                                                                                                  plot_distribution Plot column distributions.

                                                                                                                                                                                                                                                                                                                                                                  plot_relationships Plot pairwise relationships in a dataset.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_qq/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.plot_qq(columns=[5, 6])\n
                                                                                                                                                                                                                                                                                                                                                                  >>> atom.plot_qq(columns=0, distributions=[\"norm\", \"invgauss\", \"triang\"])\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_relationships/", "title": "plot_relationships", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_relationships(columns=(0, 1, 2), title=None, legend=None, figsize=(900, 900), filename=None, display=True)[source]Plot pairwise relationships in a dataset.

                                                                                                                                                                                                                                                                                                                                                                  Creates a grid of axes such that each numerical column appears once on the x-axes and once on the y-axes. The bottom triangle contains scatter plots (max 250 random samples), the diagonal plots contain column distributions, and the upper triangle contains contour histograms for all samples in the columns.

                                                                                                                                                                                                                                                                                                                                                                  Parameterscolumns: segment, sequence or dataframe, default=(0, 1, 2) Columns to plot. Selected categorical columns are ignored.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple, default=(900, 900) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_correlation Plot a correlation matrix.

                                                                                                                                                                                                                                                                                                                                                                  plot_distribution Plot column distributions.

                                                                                                                                                                                                                                                                                                                                                                  plot_qq Plot a quantile-quantile plot.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_relationships/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.plot_relationships(columns=[0, 4, 5])\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_residuals/", "title": "plot_residuals", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_residuals(models=None, rows=\"test\", target=0, title=None, legend=\"upper left\", figsize=(900, 600), filename=None, display=True)[source]Plot a model's residuals.

                                                                                                                                                                                                                                                                                                                                                                  The plot shows the residuals (difference between the predicted and the true value) on the vertical axis and the independent variable on the horizontal axis. The gray, intersected line shows the identity line. This plot can be useful to analyze the variance of the regressor's errors. If the points are randomly dispersed around the horizontal axis, a linear regression model is appropriate for the data; otherwise, a non-linear model is more appropriate. This plot is only available for regression tasks.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                                                                                                                                                                                                  rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric.

                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the data set to plot.
                                                                                                                                                                                                                                                                                                                                                                  • If sequence: Names of the data sets to plot.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Names of the sets with corresponding selection of rows as values.

                                                                                                                                                                                                                                                                                                                                                                  target: int or str, default=0 Target column to look at. Only for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"upper left\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_errors Plot a model's prediction errors.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_residuals/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMRegressor\n>>> from sklearn.datasets import load_diabetes\n\n>>> X, y = load_diabetes(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMRegressor(X, y)\n>>> atom.run([\"OLS\", \"LGB\"])\n>>> atom.plot_residuals()\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_results/", "title": "plot_results", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_results(models=None, metric=None, title=None, legend=\"lower right\", figsize=None, filename=None, display=True)[source]Plot the model results.

                                                                                                                                                                                                                                                                                                                                                                  If all models applied bootstrap, the plot is a boxplot. If not, the plot is a barplot. Models are ordered based on their score from the top down. The score is either the [metric]_bootstrap or [metric]_test values, selected in that order.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                                                                                                                                                                                                  metric: int, str, sequence or None, default=None Metric to plot (only for multi-metric runs). Other available options are: \"time_bo\", \"time_fit\", \"time_bootstrap\", \"time\". If str, add + between options to select more than one. If None, the metric used to run the pipeline is selected.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of models.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_confusion_matrix Plot a model's confusion matrix.

                                                                                                                                                                                                                                                                                                                                                                  plot_probabilities Plot the probability distribution of the target classes.

                                                                                                                                                                                                                                                                                                                                                                  plot_threshold Plot metric performances against threshold values.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_results/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"GNB\", \"LR\", \"RF\", \"LGB\"], metric=[\"f1\", \"recall\"])\n>>> atom.plot_results()\n
                                                                                                                                                                                                                                                                                                                                                                  >>> atom.run([\"GNB\", \"LR\", \"RF\", \"LGB\"], metric=[\"f1\", \"recall\"], n_bootstrap=5)\n>>> atom.plot_results()\n
                                                                                                                                                                                                                                                                                                                                                                  >>> atom.plot_results(metric=\"time_fit+time\")\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_rfecv/", "title": "plot_rfecv", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_rfecv(title=None, legend=None, figsize=(900, 600), filename=None, display=True)[source]Plot the rfecv results.

                                                                                                                                                                                                                                                                                                                                                                  Plot the scores obtained by the estimator fitted on every subset of the dataset. Only available when feature selection was applied with strategy=\"rfecv\".

                                                                                                                                                                                                                                                                                                                                                                  Parameterstitle: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=None Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_components Plot the explained variance ratio per component.

                                                                                                                                                                                                                                                                                                                                                                  plot_pca Plot the explained variance ratio vs number of components.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_rfecv/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.feature_selection(\"rfecv\", solver=\"Tree\")\n>>> atom.plot_rfecv()\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_roc/", "title": "plot_roc", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_roc(models=None, rows=\"test\", target=0, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot the Receiver Operating Characteristics curve.

                                                                                                                                                                                                                                                                                                                                                                  Read more about ROC in sklearn's documentation. Only available for classification tasks.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                                                                                                                                                                                                  rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric.

                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the data set to plot.
                                                                                                                                                                                                                                                                                                                                                                  • If sequence: Names of the data sets to plot.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Names of the sets with corresponding selection of rows as values.

                                                                                                                                                                                                                                                                                                                                                                  target: int or str, default=0 Target column to look at. Only for multilabel tasks.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_gains Plot the cumulative gains curve.

                                                                                                                                                                                                                                                                                                                                                                  plot_lift Plot the lift curve.

                                                                                                                                                                                                                                                                                                                                                                  plot_prc Plot the precision-recall curve.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_roc/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"LR\", \"RF\"])\n>>> atom.plot_roc()\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_bar/", "title": "plot_shap_bar", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_shap_bar(models=None, rows=\"test\", show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot SHAP's bar plot.

                                                                                                                                                                                                                                                                                                                                                                  Create a bar plot of a set of SHAP values. If a single sample is passed, then the SHAP values are plotted. If many samples are passed, then the mean absolute value for each feature column is plotted. Read more about SHAP plots in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., atom.lr.plot_shap_bar().

                                                                                                                                                                                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to plot.

                                                                                                                                                                                                                                                                                                                                                                  show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features.

                                                                                                                                                                                                                                                                                                                                                                  target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as png. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsplt.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_parshap Plot the partial correlation of shap values.

                                                                                                                                                                                                                                                                                                                                                                  plot_shap_beeswarm Plot SHAP's beeswarm plot.

                                                                                                                                                                                                                                                                                                                                                                  plot_shap_scatter Plot SHAP's scatter plot.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_bar/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\"LR\")\n>>> atom.plot_shap_bar(show=10)\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_beeswarm/", "title": "plot_shap_beeswarm", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_shap_beeswarm(models=None, rows=\"test\", show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot SHAP's beeswarm plot.

                                                                                                                                                                                                                                                                                                                                                                  The plot is colored by feature values. Read more about SHAP plots in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., atom.lr.plot_shap_beeswarm().

                                                                                                                                                                                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to plot. The plot_shap_beeswarm method does not support plotting a single sample.

                                                                                                                                                                                                                                                                                                                                                                  show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features.

                                                                                                                                                                                                                                                                                                                                                                  target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as png. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsplt.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_parshap Plot the partial correlation of shap values.

                                                                                                                                                                                                                                                                                                                                                                  plot_shap_bar Plot SHAP's bar plot.

                                                                                                                                                                                                                                                                                                                                                                  plot_shap_scatter Plot SHAP's scatter plot.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_beeswarm/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\"LR\")\n>>> atom.plot_shap_beeswarm(show=10)\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_decision/", "title": "plot_shap_decision", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_shap_decision(models=None, rows=\"test\", show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot SHAP's decision plot.

                                                                                                                                                                                                                                                                                                                                                                  Visualize model decisions using cumulative SHAP values. Each plotted line explains a single model prediction. If a single prediction is plotted, feature values are printed in the plot (if supplied). If multiple predictions are plotted together, feature values will not be printed. Plotting too many predictions together will make the plot unintelligible. Read more about SHAP plots in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., atom.lr.plot_shap_decision().

                                                                                                                                                                                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to plot.

                                                                                                                                                                                                                                                                                                                                                                  show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features.

                                                                                                                                                                                                                                                                                                                                                                  target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as png. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsplt.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_shap_bar Plot SHAP's bar plot.

                                                                                                                                                                                                                                                                                                                                                                  plot_shap_beeswarm Plot SHAP's beeswarm plot.

                                                                                                                                                                                                                                                                                                                                                                  plot_shap_force Plot SHAP's force plot.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_decision/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\"LR\")\n>>> atom.plot_shap_decision(show=10)\n
                                                                                                                                                                                                                                                                                                                                                                  >>> atom.plot_shap_decision(rows=-1, show=10)\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_force/", "title": "plot_shap_force", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_shap_force(models=None, rows=\"test\", target=1, title=None, legend=None, figsize=(900, 300), filename=None, display=True, **kwargs)[source]Plot SHAP's force plot.

                                                                                                                                                                                                                                                                                                                                                                  Visualize the given SHAP values with an additive force layout. Note that by default this plot will render using javascript. For a regular figure use matplotlib=True (this option is only available when only a single sample is plotted). Read more about SHAP plots in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., atom.lr.plot_shap_force().

                                                                                                                                                                                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to plot.

                                                                                                                                                                                                                                                                                                                                                                  target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=(900, 300) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as png. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure (only if matplotlib=True in kwargs).

                                                                                                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for shap.plots.force.

                                                                                                                                                                                                                                                                                                                                                                  Returnsplt.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_shap_beeswarm Plot SHAP's beeswarm plot.

                                                                                                                                                                                                                                                                                                                                                                  plot_shap_scatter Plot SHAP's scatter plot.

                                                                                                                                                                                                                                                                                                                                                                  plot_shap_decision Plot SHAP's decision plot.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_force/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\"LR\")\n>>> atom.plot_shap_force(rows=-2, matplotlib=True, figsize=(1800, 300))\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_heatmap/", "title": "plot_shap_heatmap", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_shap_heatmap(models=None, rows=\"test\", show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot SHAP's heatmap plot.

                                                                                                                                                                                                                                                                                                                                                                  This plot is designed to show the population substructure of a dataset using supervised clustering and a heatmap. Supervised clustering involves clustering data points not by their original feature values but by their explanations. Read more about SHAP plots in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., atom.lr.plot_shap_heatmap().

                                                                                                                                                                                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to plot. The plot_shap_heatmap method does not support plotting a single sample.

                                                                                                                                                                                                                                                                                                                                                                  show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features.

                                                                                                                                                                                                                                                                                                                                                                  target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as png. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsplt.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_shap_decision Plot SHAP's decision plot.

                                                                                                                                                                                                                                                                                                                                                                  plot_shap_force Plot SHAP's force plot.

                                                                                                                                                                                                                                                                                                                                                                  plot_shap_waterfall Plot SHAP's waterfall plot.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_heatmap/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\"LR\")\n>>> atom.plot_shap_heatmap(show=10)\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_scatter/", "title": "plot_shap_scatter", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_shap_scatter(models=None, rows=\"test\", columns=0, target=1, title=None, legend=None, figsize=(900, 600), filename=None, display=True)[source]Plot SHAP's scatter plot.

                                                                                                                                                                                                                                                                                                                                                                  Plots the value of the feature on the x-axis and the SHAP value of the same feature on the y-axis. This shows how the model depends on the given feature, and is like a richer extension of the classical partial dependence plots. Vertical dispersion of the data points represents interaction effects. Read more about SHAP plots in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., atom.lr.plot_shap_scatter().

                                                                                                                                                                                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to plot. The plot_shap_scatter method does not support plotting a single sample.

                                                                                                                                                                                                                                                                                                                                                                  columns: int or str, default=0 Column to plot.

                                                                                                                                                                                                                                                                                                                                                                  target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as png. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsplt.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_shap_beeswarm Plot SHAP's beeswarm plot.

                                                                                                                                                                                                                                                                                                                                                                  plot_shap_decision Plot SHAP's decision plot.

                                                                                                                                                                                                                                                                                                                                                                  plot_shap_force Plot SHAP's force plot.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_scatter/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\"LR\")\n>>> atom.plot_shap_scatter(columns=\"symmetry error\")\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_waterfall/", "title": "plot_shap_waterfall", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_shap_waterfall(models=None, rows=0, show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot SHAP's waterfall plot.

                                                                                                                                                                                                                                                                                                                                                                  The SHAP value of a feature represents the impact of the evidence provided by that feature on the model\u2019s output. The waterfall plot is designed to visually display how the SHAP values (evidence) of each feature move the model output from our prior expectation under the background data distribution, to the final model prediction given the evidence of all the features. Features are sorted by the magnitude of their SHAP values with the smallest magnitude features grouped together at the bottom of the plot when the number of features in the models exceeds the show parameter. Read more about SHAP plots in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., atom.lr.plot_shap_waterfall().

                                                                                                                                                                                                                                                                                                                                                                  rows: int or str, default=0 Selection of rows to plot. The plot_shap_waterfall method does not support plotting multiple samples.

                                                                                                                                                                                                                                                                                                                                                                  show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features.

                                                                                                                                                                                                                                                                                                                                                                  target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as png. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsplt.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_shap_bar Plot SHAP's bar plot.

                                                                                                                                                                                                                                                                                                                                                                  plot_shap_beeswarm Plot SHAP's beeswarm plot.

                                                                                                                                                                                                                                                                                                                                                                  plot_shap_heatmap Plot SHAP's heatmap plot.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_shap_waterfall/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\"LR\")\n>>> atom.plot_shap_waterfall(show=10)\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_slice/", "title": "plot_slice", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_slice(models=None, params=None, metric=None, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot the parameter relationship in a study.

                                                                                                                                                                                                                                                                                                                                                                  The color of the markers indicates the trial. This plot is only available for models that ran hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., atom.lr.plot_slice().

                                                                                                                                                                                                                                                                                                                                                                  params: str, segment, sequence or None, default=None Hyperparameters to plot. Use a sequence or add + between options to select more than one. If None, all the model's hyperparameters are selected.

                                                                                                                                                                                                                                                                                                                                                                  metric: int or str, default=None Metric to plot (only for multi-metric runs). If str, add + between options to select more than one. If None, the metric used to run the pipeline is selected.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of hyperparameters shown.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_edf Plot the Empirical Distribution Function of a study.

                                                                                                                                                                                                                                                                                                                                                                  plot_hyperparameters Plot hyperparameter relationships in a study.

                                                                                                                                                                                                                                                                                                                                                                  plot_parallel_coordinate Plot high-dimensional parameter relationships in a study.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_slice/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\n...     models=\"RF\",\n...     metric=[\"f1\", \"recall\"],\n...     n_trials=15,\n... )\n>>> atom.plot_slice(params=(0, 1, 2))\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_successive_halving/", "title": "plot_successive_halving", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_successive_halving(models=None, metric=None, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot scores per iteration of the successive halving.

                                                                                                                                                                                                                                                                                                                                                                  Only use with models fitted using successive halving. Ensembles are ignored.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                                                                                                                                                                                                  metric: int, str, sequence or None, default=None Metric to plot (only for multi-metric runs). Use a sequence or add + between options to select more than one. If None, the metric used to run the pipeline is selected.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_learning_curve Plot the learning curve: score vs number of training samples.

                                                                                                                                                                                                                                                                                                                                                                  plot_results Plot the model results.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_successive_halving/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import load_breast_cancer\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.successive_halving([\"Tree\", \"Bag\", \"RF\", \"LGB\"], n_bootstrap=5)\n>>> atom.plot_successive_halving()\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_terminator_improvement/", "title": "plot_terminator_improvement", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_terminator_improvement(models=None, title=None, legend=\"upper right\", figsize=(900, 600), filename=None, display=True)[source]Plot the potentials for future objective improvement.

                                                                                                                                                                                                                                                                                                                                                                  This function visualizes the objective improvement potentials. It helps to determine whether you should continue the optimization or not. The evaluated error is also plotted. Note that this function may take some time to compute the improvement potentials. This plot is only available for models that ran hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  • The plot_terminator_improvement method is only available for models that ran hyperparameter tuning using cross-validation, e.g., using ht_params={'cv': 5}.
                                                                                                                                                                                                                                                                                                                                                                  • This method does not support [multi-objective optimizations][multi-metric runs].
                                                                                                                                                                                                                                                                                                                                                                  • The calculation of the improvement can be slow. Set the memory parameter to cache the results and speed up repeated calls.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models that used hyperparameter tuning are selected.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"upper right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y)

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_pareto_front Plot the Pareto front of a study.

                                                                                                                                                                                                                                                                                                                                                                  plot_timeline Plot the timeline of a study.

                                                                                                                                                                                                                                                                                                                                                                  plot_trials Plot the hyperparameter tuning trials.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_terminator_improvement/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=100, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\"RF\", n_trials=10, ht_params={\"cv\": 5})\n>>> atom.plot_terminator_improvement()\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_threshold/", "title": "plot_threshold", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_threshold(models=None, metric=None, rows=\"test\", target=0, steps=100, title=None, legend=\"lower left\", figsize=(900, 600), filename=None, display=True)[source]Plot metric performances against threshold values.

                                                                                                                                                                                                                                                                                                                                                                  This plot is available only for models with a predict_proba method in a binary or multilabel classification task.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected.

                                                                                                                                                                                                                                                                                                                                                                  metric: str, func, scorer, sequence or None, default=None Metric to plot. Choose from any of sklearn's scorers, a function with signature metric(y_true, y_pred), a scorer object or a sequence of these. Use a sequence or add + between options to select more than one. If None, the metric used to run the pipeline is selected.

                                                                                                                                                                                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows on which to calculate the metric.

                                                                                                                                                                                                                                                                                                                                                                  target: int or str, default=0 Target column to look at. Only for multilabel tasks.

                                                                                                                                                                                                                                                                                                                                                                  steps: int, default=100 Number of thresholds measured.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"lower left\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_calibration Plot the calibration curve for a binary classifier.

                                                                                                                                                                                                                                                                                                                                                                  plot_confusion_matrix Plot a model's confusion matrix.

                                                                                                                                                                                                                                                                                                                                                                  plot_probabilities Plot the probability distribution of the target classes.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_threshold/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"LR\", \"RF\"])\n>>> atom.plot_threshold()\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_timeline/", "title": "plot_timeline", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_timeline(models=None, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot the timeline of a study.

                                                                                                                                                                                                                                                                                                                                                                  This plot is only available for models that ran hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models that used hyperparameter tuning are selected.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y)

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_edf Plot the Empirical Distribution Function of a study.

                                                                                                                                                                                                                                                                                                                                                                  plot_slice Plot the parameter relationship in a study.

                                                                                                                                                                                                                                                                                                                                                                  plot_terminator_improvement Plot the potentials for future objective improvement.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_timeline/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from optuna.pruners import PatientPruner\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run(\n...     models=\"LGB\",\n...     n_trials=15,\n...     ht_params={\"pruner\": PatientPruner(None, patience=2)},\n... )\n>>> atom.plot_timeline()\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_trials/", "title": "plot_trials", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_trials(models=None, metric=None, title=None, legend=\"upper left\", figsize=(900, 800), filename=None, display=True)[source]Plot the hyperparameter tuning trials.

                                                                                                                                                                                                                                                                                                                                                                  Creates a figure with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. The best trial is indicated with a star. This is the same plot as produced by ht_params={\"plot\": True}. This plot is only available for models that ran hyperparameter tuning.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models that used hyperparameter tuning are selected.

                                                                                                                                                                                                                                                                                                                                                                  metric: int, str, sequence or None, default=None Metric to plot (only for multi-metric runs). Add + between options to select more than one. If None, all metrics are selected.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=\"upper left\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple, default=(900, 800) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_evals Plot evaluation curves.

                                                                                                                                                                                                                                                                                                                                                                  plot_hyperparameters Plot hyperparameter relationships in a study.

                                                                                                                                                                                                                                                                                                                                                                  plot_results Plot the model results.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_trials/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=100, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.run([\"ET\", \"RF\"], n_trials=15)\n>>> atom.plot_trials()\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_wordcloud/", "title": "plot_wordcloud", "text": "

                                                                                                                                                                                                                                                                                                                                                                  method plot_wordcloud(rows=\"dataset\", title=None, legend=None, figsize=(900, 600), filename=None, display=True, **kwargs)[source]Plot a wordcloud from the corpus.

                                                                                                                                                                                                                                                                                                                                                                  The text for the plot is extracted from the column named corpus. If there is no column with that name, an exception is raised.

                                                                                                                                                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"dataset\" Selection of rows in the corpus to include in the wordcloud.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

                                                                                                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for the Wordcloud object.

                                                                                                                                                                                                                                                                                                                                                                  Returnsgo.Figure or None Plot object. Only returned if display=None.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  plot_ngrams Plot n-gram frequencies.

                                                                                                                                                                                                                                                                                                                                                                  plot_pipeline Plot a diagram of the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/plots/plot_wordcloud/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> import numpy as np\n>>> from atom import ATOMClassifier\n>>> from sklearn.datasets import fetch_20newsgroups\n\n>>> X, y = fetch_20newsgroups(\n...     return_X_y=True,\n...     categories=[\"alt.atheism\", \"sci.med\", \"comp.windows.x\"],\n...     shuffle=True,\n...     random_state=1,\n... )\n>>> X = np.array(X).reshape(-1, 1)\n\n>>> atom = ATOMClassifier(X, y, random_state=1)\n>>> atom.textclean()\n>>> atom.textnormalize()\n>>> atom.plot_wordcloud()\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/directclassifier/", "title": "DirectClassifier", "text": "

                                                                                                                                                                                                                                                                                                                                                                  class atom.training.DirectClassifier(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a direct fashion.

                                                                                                                                                                                                                                                                                                                                                                  The following steps are applied to every model:

                                                                                                                                                                                                                                                                                                                                                                  1. Apply hyperparameter tuning (optional).
                                                                                                                                                                                                                                                                                                                                                                  2. Fit the model on the training set using the best combination of hyperparameters found.
                                                                                                                                                                                                                                                                                                                                                                  3. Evaluate the model on the test set.
                                                                                                                                                                                                                                                                                                                                                                  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used.

                                                                                                                                                                                                                                                                                                                                                                  metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature function(y_true, y_pred, **kwargs) -> score, a scorer object or a sequence of these. If None, a default metric is selected for every task:

                                                                                                                                                                                                                                                                                                                                                                  • \"f1\" for binary classification
                                                                                                                                                                                                                                                                                                                                                                  • \"f1_weighted\" for multiclass(-multioutput) classification
                                                                                                                                                                                                                                                                                                                                                                  • \"average_precision\" for multilabel classification

                                                                                                                                                                                                                                                                                                                                                                  n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                                                                                                                                                                                                  est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add _fit to the parameter's name to pass it to the estimator's fit method instead of the constructor.

                                                                                                                                                                                                                                                                                                                                                                  ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include:

                                                                                                                                                                                                                                                                                                                                                                  • cv: int, cv-generator, dict or sequence, default=1 Cross-validation object or number of splits. If 1, the data is randomly split in a subtrain and validation set.
                                                                                                                                                                                                                                                                                                                                                                  • plot: bool, dict or sequence, default=False Whether to plot the optimization's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. See the plot_trials method.
                                                                                                                                                                                                                                                                                                                                                                  • distributions: dict, sequence or None, default=None Custom hyperparameter distributions. If None, it uses the model's predefined distributions. Read more in the user guide.
                                                                                                                                                                                                                                                                                                                                                                  • tags: dict, sequence or None, default=None Custom tags for the model's trial and mlflow run.
                                                                                                                                                                                                                                                                                                                                                                  • **kwargs Additional Keyword arguments for the constructor of the study class or the optimize method.

                                                                                                                                                                                                                                                                                                                                                                  n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                                                                                                                                                                                                  parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using parallel=True turns off the verbosity of the models during training. Note that many models also have build-in parallelizations (often when the estimator has the n_jobs parameter).

                                                                                                                                                                                                                                                                                                                                                                  errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"raise\": Raise any encountered exception.
                                                                                                                                                                                                                                                                                                                                                                  • \"skip\": Skip a failed model. This model is not accessible after training.
                                                                                                                                                                                                                                                                                                                                                                  • \"keep\": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter optimization after failure without losing previous successful trials.

                                                                                                                                                                                                                                                                                                                                                                  n_jobs: int, default=1 Number of cores to use for parallel processing.

                                                                                                                                                                                                                                                                                                                                                                  • If >0: Number of cores to use.
                                                                                                                                                                                                                                                                                                                                                                  • If -1: Use all available cores.
                                                                                                                                                                                                                                                                                                                                                                  • If <-1: Use number of cores - 1 + n_jobs.

                                                                                                                                                                                                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                                                                                                                                                                                                    • \"sklearnex\"
                                                                                                                                                                                                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                                                                                                                                                                                                  backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"loky\": Single-node, process-based parallelism.
                                                                                                                                                                                                                                                                                                                                                                  • \"multiprocessing\": Legacy single-node, process-based parallelism. Less robust than loky.
                                                                                                                                                                                                                                                                                                                                                                  • \"threading\": Single-node, thread-based parallelism.
                                                                                                                                                                                                                                                                                                                                                                  • \"ray\": Multi-node, process-based parallelism.

                                                                                                                                                                                                                                                                                                                                                                  memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  • If False: No caching is performed.
                                                                                                                                                                                                                                                                                                                                                                  • If True: A default temp directory is used.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Path to the caching directory.
                                                                                                                                                                                                                                                                                                                                                                  • If Path: A pathlib.Path to the caching directory.
                                                                                                                                                                                                                                                                                                                                                                  • If Memory: Object with the joblib.Memory interface.

                                                                                                                                                                                                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                                                                                                                                                                                                  warnings: bool or str, default=False

                                                                                                                                                                                                                                                                                                                                                                  • If True: Default warning action (equal to \"once\").
                                                                                                                                                                                                                                                                                                                                                                  • If False: Suppress all warnings (equal to \"ignore\").
                                                                                                                                                                                                                                                                                                                                                                  • If str: One of python's warnings filters.

                                                                                                                                                                                                                                                                                                                                                                  Changing this parameter affects the PYTHONWarnings environment. ATOM can't manage warnings that go from C/C++ code to stdout.

                                                                                                                                                                                                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic name.
                                                                                                                                                                                                                                                                                                                                                                  • If Path: A pathlib.Path to the log file.
                                                                                                                                                                                                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                                                                                                                                                                                                  experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed.

                                                                                                                                                                                                                                                                                                                                                                  random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  ATOMClassifier Main class for classification tasks.

                                                                                                                                                                                                                                                                                                                                                                  SuccessiveHalvingClassifier Train and evaluate the models in a successive halving fashion.

                                                                                                                                                                                                                                                                                                                                                                  TrainSizingClassifier Train and evaluate the models in a train sizing fashion.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/directclassifier/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom.training import DirectClassifier\n>>> from sklearn.datasets import load_breast_cancer\n>>> from sklearn.model_selection import train_test_split\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> train, test = train_test_split(\n...     X.merge(y.to_frame(), left_index=True, right_index=True),\n...     test_size=0.3,\n... )\n\n>>> runner = DirectClassifier(models=[\"LR\", \"RF\"], verbose=2)\n>>> runner.run(train, test)\n\n\nTraining ========================= >>\nModels: LR, RF\nMetric: f1\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.992\nTest evaluation --> f1: 0.9767\nTime elapsed: 0.104s\n-------------------------------------------------\nTime: 0.104s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.968\nTime elapsed: 0.204s\n-------------------------------------------------\nTime: 0.204s\n\n\nFinal results ==================== >>\nTotal time: 0.314s\n-------------------------------------\nLogisticRegression --> f1: 0.9767 !\nRandomForest       --> f1: 0.968\n\n\n>>> # Analyze the results\n>>> print(runner.results)\n\n    f1_train  f1_test  time_fit      time\nLR     0.992   0.9767  0.104497  0.104497\nRF     1.000   0.9680  0.204185  0.204185\n\n\n>>> print(runner.evaluate())\n\n    accuracy      ap      ba      f1  jaccard     mcc  precision  recall     auc\nLR    0.9708  0.9976  0.9702  0.9767   0.9545  0.9374     0.9813  0.9722  0.9959\nRF    0.9591  0.9490  0.9511  0.9680   0.9381  0.9118     0.9550  0.9815  0.9511\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/directclassifier/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/directclassifier/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.

                                                                                                                                                                                                                                                                                                                                                                  Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set.

                                                                                                                                                                                                                                                                                                                                                                  This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/directclassifier/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The utility attributes are used to access information about the models in the instance after training.

                                                                                                                                                                                                                                                                                                                                                                  Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance.

                                                                                                                                                                                                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. winner: model | NoneBest performing model.

                                                                                                                                                                                                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. results: pd.DataFrameOverview of the training results.

                                                                                                                                                                                                                                                                                                                                                                  All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                                                                                                  • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                                  • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                                                                                                  • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                                                                                                  • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                                                                                                  • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                                                                                                  • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                                                                                                  • time: Total duration of the run.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/directclassifier/#tracking-attributes", "title": "Tracking attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/directclassifier/#plot-attributes", "title": "Plot attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Attributespalette: str | Sequence[str]Color palette.

                                                                                                                                                                                                                                                                                                                                                                  Specify one of plotly's built-in palettes or create a custom one, e.g., atom.palette = [\"red\", \"green\", \"blue\"]. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/directclassifier/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  method available_models()[source]Give an overview of the available predefined models.

                                                                                                                                                                                                                                                                                                                                                                  Returnspd.DataFrame Information about the available predefined models. Columns include:

                                                                                                                                                                                                                                                                                                                                                                  • acronym: Model's acronym (used to call the model).
                                                                                                                                                                                                                                                                                                                                                                  • model: Name of the model's class.
                                                                                                                                                                                                                                                                                                                                                                  • estimator: The model's underlying estimator.
                                                                                                                                                                                                                                                                                                                                                                  • module: The estimator's module.
                                                                                                                                                                                                                                                                                                                                                                  • needs_scaling: Whether the model requires feature scaling.
                                                                                                                                                                                                                                                                                                                                                                  • accepts_sparse: Whether the model accepts sparse matrices.
                                                                                                                                                                                                                                                                                                                                                                  • native_multilabel: Whether the model has native support for multilabel tasks.
                                                                                                                                                                                                                                                                                                                                                                  • native_multioutput: Whether the model has native support for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                                  • has_validation: Whether the model has in-training validation.
                                                                                                                                                                                                                                                                                                                                                                  • supports_engines: Engines supported by the model.

                                                                                                                                                                                                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from all models.

                                                                                                                                                                                                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                                                                                                                                                                                                  • Shap values
                                                                                                                                                                                                                                                                                                                                                                  • App instance
                                                                                                                                                                                                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                                                                                                                                                                                                  method delete(models=None)[source]Delete models.

                                                                                                                                                                                                                                                                                                                                                                  If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted.

                                                                                                                                                                                                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task.

                                                                                                                                                                                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.

                                                                                                                                                                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                                                                                                  Returnspd.DataFrame Scores of the models.

                                                                                                                                                                                                                                                                                                                                                                  method export_pipeline(model=None)[source]Export the internal pipeline.

                                                                                                                                                                                                                                                                                                                                                                  This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported.

                                                                                                                                                                                                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                                                                                                  method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.

                                                                                                                                                                                                                                                                                                                                                                  Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.

                                                                                                                                                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                                                                                                                                                                                                  method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.

                                                                                                                                                                                                                                                                                                                                                                  Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.

                                                                                                                                                                                                                                                                                                                                                                  Parametersother: Runner Instance with which to merge. Should be of the same class as self.

                                                                                                                                                                                                                                                                                                                                                                  suffix: str, default=\"2\" Branches and models with conflicting names are merged adding suffix to the end of their names.

                                                                                                                                                                                                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                                                                                                  method run(*arrays)[source]Train and evaluate the models.

                                                                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are:

                                                                                                                                                                                                                                                                                                                                                                  • train, test
                                                                                                                                                                                                                                                                                                                                                                  • X_train, X_test, y_train, y_test
                                                                                                                                                                                                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test)

                                                                                                                                                                                                                                                                                                                                                                  method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

                                                                                                                                                                                                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                                                                                                  save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance.

                                                                                                                                                                                                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                                                                                                                                                                                                  method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                                                                                                                                                                                                  name: str, default=\"Stack\" Name of the model. The name is always presided with the model's acronym: Stack.

                                                                                                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the final_estimator parameter.

                                                                                                                                                                                                                                                                                                                                                                  method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                                                                                                                                                                                                  name: str, default=\"Vote\" Name of the model. The name is always presided with the model's acronym: Vote.

                                                                                                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's voting instance.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/directforecaster/", "title": "DirectForecaster", "text": "

                                                                                                                                                                                                                                                                                                                                                                  class atom.training.DirectForecaster(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a direct fashion.

                                                                                                                                                                                                                                                                                                                                                                  The following steps are applied to every model:

                                                                                                                                                                                                                                                                                                                                                                  1. Apply hyperparameter tuning (optional).
                                                                                                                                                                                                                                                                                                                                                                  2. Fit the model on the training set using the best combination of hyperparameters found.
                                                                                                                                                                                                                                                                                                                                                                  3. Evaluate the model on the test set.
                                                                                                                                                                                                                                                                                                                                                                  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used.

                                                                                                                                                                                                                                                                                                                                                                  metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature function(y_true, y_pred, **kwargs) -> score, a scorer object or a sequence of these. If None, the default metric mean_absolute_percentage_error is selected.

                                                                                                                                                                                                                                                                                                                                                                  n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                                                                                                                                                                                                  est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add _fit to the parameter's name to pass it to the estimator's fit method instead of the constructor.

                                                                                                                                                                                                                                                                                                                                                                  ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include:

                                                                                                                                                                                                                                                                                                                                                                  • cv: int, cv-generator, dict or sequence, default=1 Cross-validation object or number of splits. If 1, the data is randomly split in a subtrain and validation set.
                                                                                                                                                                                                                                                                                                                                                                  • plot: bool, dict or sequence, default=False Whether to plot the optimization's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. See the plot_trials method.
                                                                                                                                                                                                                                                                                                                                                                  • distributions: dict, sequence or None, default=None Custom hyperparameter distributions. If None, it uses the model's predefined distributions. Read more in the user guide.
                                                                                                                                                                                                                                                                                                                                                                  • tags: dict, sequence or None, default=None Custom tags for the model's trial and mlflow run.
                                                                                                                                                                                                                                                                                                                                                                  • **kwargs Additional Keyword arguments for the constructor of the study class or the optimize method.

                                                                                                                                                                                                                                                                                                                                                                  n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                                                                                                                                                                                                  parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using parallel=True turns off the verbosity of the models during training. Note that many models also have build-in parallelizations (often when the estimator has the n_jobs parameter).

                                                                                                                                                                                                                                                                                                                                                                  errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"raise\": Raise any encountered exception.
                                                                                                                                                                                                                                                                                                                                                                  • \"skip\": Skip a failed model. This model is not accessible after training.
                                                                                                                                                                                                                                                                                                                                                                  • \"keep\": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter optimization after failure without losing previous successful trials.

                                                                                                                                                                                                                                                                                                                                                                  n_jobs: int, default=1 Number of cores to use for parallel processing.

                                                                                                                                                                                                                                                                                                                                                                  • If >0: Number of cores to use.
                                                                                                                                                                                                                                                                                                                                                                  • If -1: Use all available cores.
                                                                                                                                                                                                                                                                                                                                                                  • If <-1: Use number of cores - 1 + n_jobs.

                                                                                                                                                                                                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                                                                                                                                                                                                    • \"sklearnex\"
                                                                                                                                                                                                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                                                                                                                                                                                                  backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"loky\": Single-node, process-based parallelism.
                                                                                                                                                                                                                                                                                                                                                                  • \"multiprocessing\": Legacy single-node, process-based parallelism. Less robust than loky.
                                                                                                                                                                                                                                                                                                                                                                  • \"threading\": Single-node, thread-based parallelism.
                                                                                                                                                                                                                                                                                                                                                                  • \"ray\": Multi-node, process-based parallelism.

                                                                                                                                                                                                                                                                                                                                                                  memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  • If False: No caching is performed.
                                                                                                                                                                                                                                                                                                                                                                  • If True: A default temp directory is used.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Path to the caching directory.
                                                                                                                                                                                                                                                                                                                                                                  • If Path: A pathlib.Path to the caching directory.
                                                                                                                                                                                                                                                                                                                                                                  • If Memory: Object with the joblib.Memory interface.

                                                                                                                                                                                                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                                                                                                                                                                                                  warnings: bool or str, default=False

                                                                                                                                                                                                                                                                                                                                                                  • If True: Default warning action (equal to \"once\").
                                                                                                                                                                                                                                                                                                                                                                  • If False: Suppress all warnings (equal to \"ignore\").
                                                                                                                                                                                                                                                                                                                                                                  • If str: One of python's warnings filters.

                                                                                                                                                                                                                                                                                                                                                                  Changing this parameter affects the PYTHONWarnings environment. ATOM can't manage warnings that go from C/C++ code to stdout.

                                                                                                                                                                                                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic name.
                                                                                                                                                                                                                                                                                                                                                                  • If Path: A pathlib.Path to the log file.
                                                                                                                                                                                                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                                                                                                                                                                                                  experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed.

                                                                                                                                                                                                                                                                                                                                                                  random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  ATOMForecaster Main class for forecasting tasks.

                                                                                                                                                                                                                                                                                                                                                                  SuccessiveHalvingForecaster Train and evaluate the models in a successive halving fashion.

                                                                                                                                                                                                                                                                                                                                                                  TrainSizingForecaster Train and evaluate the models in a train sizing fashion.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/directforecaster/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom.training import DirectForecaster\n>>> from sktime.datasets import load_airline\n>>> from sktime.split import temporal_train_test_split\n\n>>> y = load_airline()\n\n>>> train, test = temporal_train_test_split(y, test_size=0.2)\n\n>>> runner = DirectForecaster(models=[\"ES\", \"ETS\"], verbose=2)\n>>> runner.run(train, test)\n\n\nTraining ========================= >>\nModels: ES, ETS\nMetric: mape\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0868\nTest evaluation --> mape: -0.2018\nTime elapsed: 0.019s\n-------------------------------------------------\nTime: 0.019s\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0863\nTest evaluation --> mape: -0.202\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nFinal results ==================== >>\nTotal time: 0.041s\n-------------------------------------\nExponentialSmoothing --> mape: -0.2018 !\nETS                  --> mape: -0.202\n\n\n>>> # Analyze the results\n>>> print(runner.results)\n\n     mape_train  mape_test  time_fit      time\nES      -0.0868    -0.2018  0.019017  0.019017\nETS     -0.0863    -0.2020  0.020018  0.020018\n\n\n>>> print(runner.evaluate())\n\n         mae    mape        mse      r2     rmse\nES  -81.3862 -0.2018 -8661.7730 -0.4189 -93.0686\nETS -81.4454 -0.2020 -8673.3633 -0.4208 -93.1309\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/directforecaster/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/directforecaster/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.

                                                                                                                                                                                                                                                                                                                                                                  Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set.

                                                                                                                                                                                                                                                                                                                                                                  This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/directforecaster/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The utility attributes are used to access information about the models in the instance after training.

                                                                                                                                                                                                                                                                                                                                                                  Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance.

                                                                                                                                                                                                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. winner: model | NoneBest performing model.

                                                                                                                                                                                                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. results: pd.DataFrameOverview of the training results.

                                                                                                                                                                                                                                                                                                                                                                  All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                                                                                                  • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                                  • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                                                                                                  • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                                                                                                  • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                                                                                                  • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                                                                                                  • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                                                                                                  • time: Total duration of the run.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/directforecaster/#tracking-attributes", "title": "Tracking attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/directforecaster/#plot-attributes", "title": "Plot attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Attributespalette: str | Sequence[str]Color palette.

                                                                                                                                                                                                                                                                                                                                                                  Specify one of plotly's built-in palettes or create a custom one, e.g., atom.palette = [\"red\", \"green\", \"blue\"]. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/directforecaster/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  method available_models()[source]Give an overview of the available predefined models.

                                                                                                                                                                                                                                                                                                                                                                  Returnspd.DataFrame Information about the available predefined models. Columns include:

                                                                                                                                                                                                                                                                                                                                                                  • acronym: Model's acronym (used to call the model).
                                                                                                                                                                                                                                                                                                                                                                  • model: Name of the model's class.
                                                                                                                                                                                                                                                                                                                                                                  • estimator: The model's underlying estimator.
                                                                                                                                                                                                                                                                                                                                                                  • module: The estimator's module.
                                                                                                                                                                                                                                                                                                                                                                  • needs_scaling: Whether the model requires feature scaling.
                                                                                                                                                                                                                                                                                                                                                                  • accepts_sparse: Whether the model accepts sparse matrices.
                                                                                                                                                                                                                                                                                                                                                                  • native_multilabel: Whether the model has native support for multilabel tasks.
                                                                                                                                                                                                                                                                                                                                                                  • native_multioutput: Whether the model has native support for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                                  • has_validation: Whether the model has in-training validation.
                                                                                                                                                                                                                                                                                                                                                                  • supports_engines: Engines supported by the model.

                                                                                                                                                                                                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from all models.

                                                                                                                                                                                                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                                                                                                                                                                                                  • Shap values
                                                                                                                                                                                                                                                                                                                                                                  • App instance
                                                                                                                                                                                                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                                                                                                                                                                                                  method delete(models=None)[source]Delete models.

                                                                                                                                                                                                                                                                                                                                                                  If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted.

                                                                                                                                                                                                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task.

                                                                                                                                                                                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.

                                                                                                                                                                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                                                                                                  Returnspd.DataFrame Scores of the models.

                                                                                                                                                                                                                                                                                                                                                                  method export_pipeline(model=None)[source]Export the internal pipeline.

                                                                                                                                                                                                                                                                                                                                                                  This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported.

                                                                                                                                                                                                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                                                                                                  method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.

                                                                                                                                                                                                                                                                                                                                                                  Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.

                                                                                                                                                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                                                                                                                                                                                                  method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.

                                                                                                                                                                                                                                                                                                                                                                  Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.

                                                                                                                                                                                                                                                                                                                                                                  Parametersother: Runner Instance with which to merge. Should be of the same class as self.

                                                                                                                                                                                                                                                                                                                                                                  suffix: str, default=\"2\" Branches and models with conflicting names are merged adding suffix to the end of their names.

                                                                                                                                                                                                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                                                                                                  method run(*arrays)[source]Train and evaluate the models.

                                                                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are:

                                                                                                                                                                                                                                                                                                                                                                  • train, test
                                                                                                                                                                                                                                                                                                                                                                  • X_train, X_test, y_train, y_test
                                                                                                                                                                                                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test)

                                                                                                                                                                                                                                                                                                                                                                  method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

                                                                                                                                                                                                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                                                                                                  save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance.

                                                                                                                                                                                                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                                                                                                                                                                                                  method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                                                                                                                                                                                                  name: str, default=\"Stack\" Name of the model. The name is always presided with the model's acronym: Stack.

                                                                                                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the final_estimator parameter.

                                                                                                                                                                                                                                                                                                                                                                  method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                                                                                                                                                                                                  name: str, default=\"Vote\" Name of the model. The name is always presided with the model's acronym: Vote.

                                                                                                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's voting instance.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/directregressor/", "title": "DirectRegressor", "text": "

                                                                                                                                                                                                                                                                                                                                                                  class atom.training.DirectRegressor(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a direct fashion.

                                                                                                                                                                                                                                                                                                                                                                  The following steps are applied to every model:

                                                                                                                                                                                                                                                                                                                                                                  1. Apply hyperparameter tuning (optional).
                                                                                                                                                                                                                                                                                                                                                                  2. Fit the model on the training set using the best combination of hyperparameters found.
                                                                                                                                                                                                                                                                                                                                                                  3. Evaluate the model on the test set.
                                                                                                                                                                                                                                                                                                                                                                  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used.

                                                                                                                                                                                                                                                                                                                                                                  metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature function(y_true, y_pred, **kwargs) -> score, a scorer object or a sequence of these. If None, the default metric r2 is selected.

                                                                                                                                                                                                                                                                                                                                                                  n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                                                                                                                                                                                                  est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add _fit to the parameter's name to pass it to the estimator's fit method instead of the constructor.

                                                                                                                                                                                                                                                                                                                                                                  ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include:

                                                                                                                                                                                                                                                                                                                                                                  • cv: int, cv-generator, dict or sequence, default=1 Cross-validation object or number of splits. If 1, the data is randomly split in a subtrain and validation set.
                                                                                                                                                                                                                                                                                                                                                                  • plot: bool, dict or sequence, default=False Whether to plot the optimization's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. See the plot_trials method.
                                                                                                                                                                                                                                                                                                                                                                  • distributions: dict, sequence or None, default=None Custom hyperparameter distributions. If None, it uses the model's predefined distributions. Read more in the user guide.
                                                                                                                                                                                                                                                                                                                                                                  • tags: dict, sequence or None, default=None Custom tags for the model's trial and mlflow run.
                                                                                                                                                                                                                                                                                                                                                                  • **kwargs Additional Keyword arguments for the constructor of the study class or the optimize method.

                                                                                                                                                                                                                                                                                                                                                                  n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                                                                                                                                                                                                  parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using parallel=True turns off the verbosity of the models during training. Note that many models also have build-in parallelizations (often when the estimator has the n_jobs parameter).

                                                                                                                                                                                                                                                                                                                                                                  errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"raise\": Raise any encountered exception.
                                                                                                                                                                                                                                                                                                                                                                  • \"skip\": Skip a failed model. This model is not accessible after training.
                                                                                                                                                                                                                                                                                                                                                                  • \"keep\": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter optimization after failure without losing previous successful trials.

                                                                                                                                                                                                                                                                                                                                                                  n_jobs: int, default=1 Number of cores to use for parallel processing.

                                                                                                                                                                                                                                                                                                                                                                  • If >0: Number of cores to use.
                                                                                                                                                                                                                                                                                                                                                                  • If -1: Use all available cores.
                                                                                                                                                                                                                                                                                                                                                                  • If <-1: Use number of cores - 1 + n_jobs.

                                                                                                                                                                                                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                                                                                                                                                                                                    • \"sklearnex\"
                                                                                                                                                                                                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                                                                                                                                                                                                  backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"loky\": Single-node, process-based parallelism.
                                                                                                                                                                                                                                                                                                                                                                  • \"multiprocessing\": Legacy single-node, process-based parallelism. Less robust than loky.
                                                                                                                                                                                                                                                                                                                                                                  • \"threading\": Single-node, thread-based parallelism.
                                                                                                                                                                                                                                                                                                                                                                  • \"ray\": Multi-node, process-based parallelism.

                                                                                                                                                                                                                                                                                                                                                                  memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  • If False: No caching is performed.
                                                                                                                                                                                                                                                                                                                                                                  • If True: A default temp directory is used.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Path to the caching directory.
                                                                                                                                                                                                                                                                                                                                                                  • If Path: A pathlib.Path to the caching directory.
                                                                                                                                                                                                                                                                                                                                                                  • If Memory: Object with the joblib.Memory interface.

                                                                                                                                                                                                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                                                                                                                                                                                                  warnings: bool or str, default=False

                                                                                                                                                                                                                                                                                                                                                                  • If True: Default warning action (equal to \"once\").
                                                                                                                                                                                                                                                                                                                                                                  • If False: Suppress all warnings (equal to \"ignore\").
                                                                                                                                                                                                                                                                                                                                                                  • If str: One of python's warnings filters.

                                                                                                                                                                                                                                                                                                                                                                  Changing this parameter affects the PYTHONWarnings environment. ATOM can't manage warnings that go from C/C++ code to stdout.

                                                                                                                                                                                                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic name.
                                                                                                                                                                                                                                                                                                                                                                  • If Path: A pathlib.Path to the log file.
                                                                                                                                                                                                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                                                                                                                                                                                                  experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed.

                                                                                                                                                                                                                                                                                                                                                                  random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  ATOMRegressor Main class for regression tasks.

                                                                                                                                                                                                                                                                                                                                                                  SuccessiveHalvingRegressor Train and evaluate the models in a successive halving fashion.

                                                                                                                                                                                                                                                                                                                                                                  TrainSizingRegressor Train and evaluate the models in a train sizing fashion.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/directregressor/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom.training import DirectRegressor\n>>> from sklearn.datasets import load_digits\n>>> from sklearn.model_selection import train_test_split\n\n>>> X, y = load_digits(return_X_y=True, as_frame=True)\n\n>>> train, test = train_test_split(\n...     X.merge(y.to_frame(), left_index=True, right_index=True),\n...     test_size=0.3,\n... )\n\n>>> runner = DirectRegressor(models=[\"OLS\", \"RF\"], verbose=2)\n>>> runner.run(train, test)\n\n\nTraining ========================= >>\nModels: OLS, RF\nMetric: r2\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.5991\nTest evaluation --> r2: 0.5765\nTime elapsed: 0.154s\n-------------------------------------------------\nTime: 0.154s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.9803\nTest evaluation --> r2: 0.8803\nTime elapsed: 1.594s\n-------------------------------------------------\nTime: 1.594s\n\n\nFinal results ==================== >>\nTotal time: 1.749s\n-------------------------------------\nOrdinaryLeastSquares --> r2: 0.5765\nRandomForest         --> r2: 0.8803 !\n\n\n>>> # Analyze the results\n>>> print(runner.results)\n\n     r2_train  r2_test  time_fit      time\nOLS    0.5991   0.5765  0.153989  0.153989\nRF     0.9803   0.8803  1.594449  1.594449\n\n\n>>> print(runner.evaluate())\n\n        mae          mape     mse      r2    rmse\nOLS -1.4553 -9.184808e+14 -3.4564  0.5765 -1.8591\nRF  -0.6098 -2.854782e+14 -0.9773  0.8803 -0.9886\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/directregressor/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/directregressor/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.

                                                                                                                                                                                                                                                                                                                                                                  Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set.

                                                                                                                                                                                                                                                                                                                                                                  This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/directregressor/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The utility attributes are used to access information about the models in the instance after training.

                                                                                                                                                                                                                                                                                                                                                                  Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance.

                                                                                                                                                                                                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. winner: model | NoneBest performing model.

                                                                                                                                                                                                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. results: pd.DataFrameOverview of the training results.

                                                                                                                                                                                                                                                                                                                                                                  All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                                                                                                  • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                                  • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                                                                                                  • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                                                                                                  • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                                                                                                  • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                                                                                                  • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                                                                                                  • time: Total duration of the run.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/directregressor/#tracking-attributes", "title": "Tracking attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/directregressor/#plot-attributes", "title": "Plot attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Attributespalette: str | Sequence[str]Color palette.

                                                                                                                                                                                                                                                                                                                                                                  Specify one of plotly's built-in palettes or create a custom one, e.g., atom.palette = [\"red\", \"green\", \"blue\"]. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/directregressor/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  method available_models()[source]Give an overview of the available predefined models.

                                                                                                                                                                                                                                                                                                                                                                  Returnspd.DataFrame Information about the available predefined models. Columns include:

                                                                                                                                                                                                                                                                                                                                                                  • acronym: Model's acronym (used to call the model).
                                                                                                                                                                                                                                                                                                                                                                  • model: Name of the model's class.
                                                                                                                                                                                                                                                                                                                                                                  • estimator: The model's underlying estimator.
                                                                                                                                                                                                                                                                                                                                                                  • module: The estimator's module.
                                                                                                                                                                                                                                                                                                                                                                  • needs_scaling: Whether the model requires feature scaling.
                                                                                                                                                                                                                                                                                                                                                                  • accepts_sparse: Whether the model accepts sparse matrices.
                                                                                                                                                                                                                                                                                                                                                                  • native_multilabel: Whether the model has native support for multilabel tasks.
                                                                                                                                                                                                                                                                                                                                                                  • native_multioutput: Whether the model has native support for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                                  • has_validation: Whether the model has in-training validation.
                                                                                                                                                                                                                                                                                                                                                                  • supports_engines: Engines supported by the model.

                                                                                                                                                                                                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from all models.

                                                                                                                                                                                                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                                                                                                                                                                                                  • Shap values
                                                                                                                                                                                                                                                                                                                                                                  • App instance
                                                                                                                                                                                                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                                                                                                                                                                                                  method delete(models=None)[source]Delete models.

                                                                                                                                                                                                                                                                                                                                                                  If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted.

                                                                                                                                                                                                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task.

                                                                                                                                                                                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.

                                                                                                                                                                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                                                                                                  Returnspd.DataFrame Scores of the models.

                                                                                                                                                                                                                                                                                                                                                                  method export_pipeline(model=None)[source]Export the internal pipeline.

                                                                                                                                                                                                                                                                                                                                                                  This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported.

                                                                                                                                                                                                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                                                                                                  method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.

                                                                                                                                                                                                                                                                                                                                                                  Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.

                                                                                                                                                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                                                                                                                                                                                                  method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.

                                                                                                                                                                                                                                                                                                                                                                  Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.

                                                                                                                                                                                                                                                                                                                                                                  Parametersother: Runner Instance with which to merge. Should be of the same class as self.

                                                                                                                                                                                                                                                                                                                                                                  suffix: str, default=\"2\" Branches and models with conflicting names are merged adding suffix to the end of their names.

                                                                                                                                                                                                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                                                                                                  method run(*arrays)[source]Train and evaluate the models.

                                                                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are:

                                                                                                                                                                                                                                                                                                                                                                  • train, test
                                                                                                                                                                                                                                                                                                                                                                  • X_train, X_test, y_train, y_test
                                                                                                                                                                                                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test)

                                                                                                                                                                                                                                                                                                                                                                  method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

                                                                                                                                                                                                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                                                                                                  save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance.

                                                                                                                                                                                                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                                                                                                                                                                                                  method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                                                                                                                                                                                                  name: str, default=\"Stack\" Name of the model. The name is always presided with the model's acronym: Stack.

                                                                                                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the final_estimator parameter.

                                                                                                                                                                                                                                                                                                                                                                  method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                                                                                                                                                                                                  name: str, default=\"Vote\" Name of the model. The name is always presided with the model's acronym: Vote.

                                                                                                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's voting instance.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingclassifier/", "title": "SuccessiveHalvingClassifier", "text": "

                                                                                                                                                                                                                                                                                                                                                                  class atom.training.SuccessiveHalvingClassifier(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a successive halving fashion.

                                                                                                                                                                                                                                                                                                                                                                  The following steps are applied to every model (per iteration):

                                                                                                                                                                                                                                                                                                                                                                  1. Apply hyperparameter tuning (optional).
                                                                                                                                                                                                                                                                                                                                                                  2. Fit the model on the training set using the best combination of hyperparameters found.
                                                                                                                                                                                                                                                                                                                                                                  3. Evaluate the model on the test set.
                                                                                                                                                                                                                                                                                                                                                                  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used.

                                                                                                                                                                                                                                                                                                                                                                  metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature function(y_true, y_pred, **kwargs) -> score, a scorer object or a sequence of these. If None, a default metric is selected for every task:

                                                                                                                                                                                                                                                                                                                                                                  • \"f1\" for binary classification
                                                                                                                                                                                                                                                                                                                                                                  • \"f1_weighted\" for multiclass(-multioutput) classification
                                                                                                                                                                                                                                                                                                                                                                  • \"average_precision\" for multilabel classification

                                                                                                                                                                                                                                                                                                                                                                  skip_runs: int, default=0 Skip last skip_runs runs of the successive halving.

                                                                                                                                                                                                                                                                                                                                                                  n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                                                                                                                                                                                                  est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add _fit to the parameter's name to pass it to the estimator's fit method instead of the constructor.

                                                                                                                                                                                                                                                                                                                                                                  ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include:

                                                                                                                                                                                                                                                                                                                                                                  • cv: int, cv-generator, dict or sequence, default=1 Cross-validation object or number of splits. If 1, the data is randomly split in a subtrain and validation set.
                                                                                                                                                                                                                                                                                                                                                                  • plot: bool, dict or sequence, default=False Whether to plot the optimization's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. See the plot_trials method.
                                                                                                                                                                                                                                                                                                                                                                  • distributions: dict, sequence or None, default=None Custom hyperparameter distributions. If None, it uses the model's predefined distributions. Read more in the user guide.
                                                                                                                                                                                                                                                                                                                                                                  • tags: dict, sequence or None, default=None Custom tags for the model's trial and mlflow run.
                                                                                                                                                                                                                                                                                                                                                                  • **kwargs Additional Keyword arguments for the constructor of the study class or the optimize method.

                                                                                                                                                                                                                                                                                                                                                                  n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                                                                                                                                                                                                  parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using parallel=True turns off the verbosity of the models during training. Note that many models also have build-in parallelizations (often when the estimator has the n_jobs parameter).

                                                                                                                                                                                                                                                                                                                                                                  errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"raise\": Raise any encountered exception.
                                                                                                                                                                                                                                                                                                                                                                  • \"skip\": Skip a failed model. This model is not accessible after training.
                                                                                                                                                                                                                                                                                                                                                                  • \"keep\": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter optimization after failure without losing previous successful trials.

                                                                                                                                                                                                                                                                                                                                                                  n_jobs: int, default=1 Number of cores to use for parallel processing.

                                                                                                                                                                                                                                                                                                                                                                  • If >0: Number of cores to use.
                                                                                                                                                                                                                                                                                                                                                                  • If -1: Use all available cores.
                                                                                                                                                                                                                                                                                                                                                                  • If <-1: Use number of cores - 1 + n_jobs.

                                                                                                                                                                                                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                                                                                                                                                                                                    • \"sklearnex\"
                                                                                                                                                                                                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                                                                                                                                                                                                  backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"loky\": Single-node, process-based parallelism.
                                                                                                                                                                                                                                                                                                                                                                  • \"multiprocessing\": Legacy single-node, process-based parallelism. Less robust than loky.
                                                                                                                                                                                                                                                                                                                                                                  • \"threading\": Single-node, thread-based parallelism.
                                                                                                                                                                                                                                                                                                                                                                  • \"ray\": Multi-node, process-based parallelism.

                                                                                                                                                                                                                                                                                                                                                                  memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  • If False: No caching is performed.
                                                                                                                                                                                                                                                                                                                                                                  • If True: A default temp directory is used.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Path to the caching directory.
                                                                                                                                                                                                                                                                                                                                                                  • If Path: A pathlib.Path to the caching directory.
                                                                                                                                                                                                                                                                                                                                                                  • If Memory: Object with the joblib.Memory interface.

                                                                                                                                                                                                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                                                                                                                                                                                                  warnings: bool or str, default=False

                                                                                                                                                                                                                                                                                                                                                                  • If True: Default warning action (equal to \"once\").
                                                                                                                                                                                                                                                                                                                                                                  • If False: Suppress all warnings (equal to \"ignore\").
                                                                                                                                                                                                                                                                                                                                                                  • If str: One of python's warnings filters.

                                                                                                                                                                                                                                                                                                                                                                  Changing this parameter affects the PYTHONWarnings environment. ATOM can't manage warnings that go from C/C++ code to stdout.

                                                                                                                                                                                                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic name.
                                                                                                                                                                                                                                                                                                                                                                  • If Path: A pathlib.Path to the log file.
                                                                                                                                                                                                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                                                                                                                                                                                                  experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed.

                                                                                                                                                                                                                                                                                                                                                                  random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  ATOMClassifier Main class for classification tasks.

                                                                                                                                                                                                                                                                                                                                                                  DirectClassifier Train and evaluate the models in a direct fashion.

                                                                                                                                                                                                                                                                                                                                                                  TrainSizingClassifier Train and evaluate the models in a train sizing fashion.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingclassifier/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom.training import SuccessiveHalvingClassifier\n>>> from sklearn.datasets import load_breast_cancer\n>>> from sklearn.model_selection import train_test_split\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> train, test = train_test_split(\n...     X.merge(y.to_frame(), left_index=True, right_index=True),\n...     test_size=0.3,\n... )\n\n>>> runner = SuccessiveHalvingClassifier([\"LR\", \"RF\"], verbose=2)\n>>> runner.run(train, test)\n\n\nTraining ========================= >>\nMetric: f1\n\n\nRun: 0 =========================== >>\nModels: LR2, RF2\nSize of training set: 398 (50%)\nSize of test set: 171\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.996\nTest evaluation --> f1: 0.9677\nTime elapsed: 0.086s\n-------------------------------------------------\nTime: 0.086s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.9444\nTime elapsed: 0.137s\n-------------------------------------------------\nTime: 0.137s\n\n\nFinal results ==================== >>\nTotal time: 0.228s\n-------------------------------------\nLogisticRegression --> f1: 0.9677 !\nRandomForest       --> f1: 0.9444\n\n\nRun: 1 =========================== >>\nModels: LR1\nSize of training set: 398 (100%)\nSize of test set: 171\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.994\nTest evaluation --> f1: 0.9818\nTime elapsed: 0.095s\n-------------------------------------------------\nTime: 
0.095s\n\n\nFinal results ==================== >>\nTotal time: 0.098s\n-------------------------------------\nLogisticRegression --> f1: 0.9818\n\n\n>>> # Analyze the results\n>>> print(runner.results)\n\n            f1_train  f1_test  time_fit      time\nfrac model                                       \n0.5  LR2       0.996   0.9677  0.086078  0.086078\n     RF2       1.000   0.9444  0.137125  0.137125\n1.0  LR1       0.994   0.9818  0.094800  0.094800\n\n\n>>> print(runner.evaluate())\n\n     accuracy      ap      ba      f1  jaccard     mcc  precision  recall     auc\nLR2    0.9591  0.9963  0.9609  0.9677   0.9375  0.9124     0.9813  0.9545  0.9937\nRF2    0.9298  0.9391  0.9308  0.9444   0.8947  0.8504     0.9623  0.9273  0.9308\nLR1    0.9766  0.9972  0.9745  0.9818   0.9643  0.9490     0.9818  0.9818  0.9952\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingclassifier/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/successivehalvingclassifier/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.

                                                                                                                                                                                                                                                                                                                                                                  Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set.

                                                                                                                                                                                                                                                                                                                                                                  This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingclassifier/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The utility attributes are used to access information about the models in the instance after training.

                                                                                                                                                                                                                                                                                                                                                                  Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance.

                                                                                                                                                                                                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. winner: model | NoneBest performing model.

                                                                                                                                                                                                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. results: pd.DataFrameOverview of the training results.

                                                                                                                                                                                                                                                                                                                                                                  All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                                                                                                  • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                                  • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                                                                                                  • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                                                                                                  • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                                                                                                  • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                                                                                                  • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                                                                                                  • time: Total duration of the run.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingclassifier/#tracking-attributes", "title": "Tracking attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingclassifier/#plot-attributes", "title": "Plot attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Attributespalette: str | Sequence[str]Color palette.

                                                                                                                                                                                                                                                                                                                                                                  Specify one of plotly's built-in palettes or create a custom one, e.g., atom.palette = [\"red\", \"green\", \"blue\"]. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingclassifier/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  method available_models()[source]Give an overview of the available predefined models.

                                                                                                                                                                                                                                                                                                                                                                  Returnspd.DataFrame Information about the available predefined models. Columns include:

                                                                                                                                                                                                                                                                                                                                                                  • acronym: Model's acronym (used to call the model).
                                                                                                                                                                                                                                                                                                                                                                  • model: Name of the model's class.
                                                                                                                                                                                                                                                                                                                                                                  • estimator: The model's underlying estimator.
                                                                                                                                                                                                                                                                                                                                                                  • module: The estimator's module.
                                                                                                                                                                                                                                                                                                                                                                  • needs_scaling: Whether the model requires feature scaling.
                                                                                                                                                                                                                                                                                                                                                                  • accepts_sparse: Whether the model accepts sparse matrices.
                                                                                                                                                                                                                                                                                                                                                                  • native_multilabel: Whether the model has native support for multilabel tasks.
                                                                                                                                                                                                                                                                                                                                                                  • native_multioutput: Whether the model has native support for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                                  • has_validation: Whether the model has in-training validation.
                                                                                                                                                                                                                                                                                                                                                                  • supports_engines: Engines supported by the model.

                                                                                                                                                                                                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from all models.

                                                                                                                                                                                                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                                                                                                                                                                                                  • Shap values
                                                                                                                                                                                                                                                                                                                                                                  • App instance
                                                                                                                                                                                                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                                                                                                                                                                                                  method delete(models=None)[source]Delete models.

                                                                                                                                                                                                                                                                                                                                                                  If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted.

                                                                                                                                                                                                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task.

                                                                                                                                                                                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.

                                                                                                                                                                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                                                                                                  Returnspd.DataFrame Scores of the models.

                                                                                                                                                                                                                                                                                                                                                                  method export_pipeline(model=None)[source]Export the internal pipeline.

                                                                                                                                                                                                                                                                                                                                                                  This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported.

                                                                                                                                                                                                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                                                                                                  method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.

                                                                                                                                                                                                                                                                                                                                                                  Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.

                                                                                                                                                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                                                                                                                                                                                                  method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.

                                                                                                                                                                                                                                                                                                                                                                  Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.

                                                                                                                                                                                                                                                                                                                                                                  Parametersother: Runner Instance with which to merge. Should be of the same class as self.

                                                                                                                                                                                                                                                                                                                                                                  suffix: str, default=\"2\" Branches and models with conflicting names are merged adding suffix to the end of their names.

                                                                                                                                                                                                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                                                                                                  method run(*arrays)[source]Train and evaluate the models.

                                                                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are:

                                                                                                                                                                                                                                                                                                                                                                  • train, test
                                                                                                                                                                                                                                                                                                                                                                  • X_train, X_test, y_train, y_test
                                                                                                                                                                                                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test)

                                                                                                                                                                                                                                                                                                                                                                  method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

                                                                                                                                                                                                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                                                                                                  save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance.

                                                                                                                                                                                                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                                                                                                                                                                                                  method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                                                                                                                                                                                                  name: str, default=\"Stack\" Name of the model. The name is always presided with the model's acronym: Stack.

                                                                                                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the final_estimator parameter.

                                                                                                                                                                                                                                                                                                                                                                  method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                                                                                                                                                                                                  name: str, default=\"Vote\" Name of the model. The name is always presided with the model's acronym: Vote.

                                                                                                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's voting instance.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingforecaster/", "title": "SuccessiveHalvingForecaster", "text": "

                                                                                                                                                                                                                                                                                                                                                                  class atom.training.SuccessiveHalvingForecaster(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a successive halving fashion.

                                                                                                                                                                                                                                                                                                                                                                  The following steps are applied to every model (per iteration):

                                                                                                                                                                                                                                                                                                                                                                  1. Apply hyperparameter tuning (optional).
                                                                                                                                                                                                                                                                                                                                                                  2. Fit the model on the training set using the best combination of hyperparameters found.
                                                                                                                                                                                                                                                                                                                                                                  3. Evaluate the model on the test set.
                                                                                                                                                                                                                                                                                                                                                                  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used.

                                                                                                                                                                                                                                                                                                                                                                  metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature function(y_true, y_pred, **kwargs) -> score, a scorer object or a sequence of these. If None, the default metric mean_absolute_percentage_error is selected.

                                                                                                                                                                                                                                                                                                                                                                  skip_runs: int, default=0 Skip last skip_runs runs of the successive halving.

                                                                                                                                                                                                                                                                                                                                                                  n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                                                                                                                                                                                                  est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add _fit to the parameter's name to pass it to the estimator's fit method instead of the constructor.

                                                                                                                                                                                                                                                                                                                                                                  ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include:

                                                                                                                                                                                                                                                                                                                                                                  • cv: int, cv-generator, dict or sequence, default=1 Cross-validation object or number of splits. If 1, the data is randomly split in a subtrain and validation set.
                                                                                                                                                                                                                                                                                                                                                                  • plot: bool, dict or sequence, default=False Whether to plot the optimization's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. See the plot_trials method.
                                                                                                                                                                                                                                                                                                                                                                  • distributions: dict, sequence or None, default=None Custom hyperparameter distributions. If None, it uses the model's predefined distributions. Read more in the user guide.
                                                                                                                                                                                                                                                                                                                                                                  • tags: dict, sequence or None, default=None Custom tags for the model's trial and mlflow run.
                                                                                                                                                                                                                                                                                                                                                                  • **kwargs Additional Keyword arguments for the constructor of the study class or the optimize method.

                                                                                                                                                                                                                                                                                                                                                                  n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                                                                                                                                                                                                  parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using parallel=True turns off the verbosity of the models during training. Note that many models also have build-in parallelizations (often when the estimator has the n_jobs parameter).

                                                                                                                                                                                                                                                                                                                                                                  errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"raise\": Raise any encountered exception.
                                                                                                                                                                                                                                                                                                                                                                  • \"skip\": Skip a failed model. This model is not accessible after training.
                                                                                                                                                                                                                                                                                                                                                                  • \"keep\": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter optimization after failure without losing previous successful trials.

                                                                                                                                                                                                                                                                                                                                                                  n_jobs: int, default=1 Number of cores to use for parallel processing.

                                                                                                                                                                                                                                                                                                                                                                  • If >0: Number of cores to use.
                                                                                                                                                                                                                                                                                                                                                                  • If -1: Use all available cores.
                                                                                                                                                                                                                                                                                                                                                                  • If <-1: Use number of cores - 1 + n_jobs.

                                                                                                                                                                                                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                                                                                                                                                                                                    • \"sklearnex\"
                                                                                                                                                                                                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                                                                                                                                                                                                  backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"loky\": Single-node, process-based parallelism.
                                                                                                                                                                                                                                                                                                                                                                  • \"multiprocessing\": Legacy single-node, process-based parallelism. Less robust than loky.
                                                                                                                                                                                                                                                                                                                                                                  • \"threading\": Single-node, thread-based parallelism.
                                                                                                                                                                                                                                                                                                                                                                  • \"ray\": Multi-node, process-based parallelism.

                                                                                                                                                                                                                                                                                                                                                                  memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  • If False: No caching is performed.
                                                                                                                                                                                                                                                                                                                                                                  • If True: A default temp directory is used.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Path to the caching directory.
                                                                                                                                                                                                                                                                                                                                                                  • If Path: A pathlib.Path to the caching directory.
                                                                                                                                                                                                                                                                                                                                                                  • If Memory: Object with the joblib.Memory interface.

                                                                                                                                                                                                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                                                                                                                                                                                                  warnings: bool or str, default=False

                                                                                                                                                                                                                                                                                                                                                                  • If True: Default warning action (equal to \"once\").
                                                                                                                                                                                                                                                                                                                                                                  • If False: Suppress all warnings (equal to \"ignore\").
                                                                                                                                                                                                                                                                                                                                                                  • If str: One of python's warnings filters.

                                                                                                                                                                                                                                                                                                                                                                  Changing this parameter affects the PYTHONWarnings environment. ATOM can't manage warnings that go from C/C++ code to stdout.

                                                                                                                                                                                                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic name.
                                                                                                                                                                                                                                                                                                                                                                  • If Path: A pathlib.Path to the log file.
                                                                                                                                                                                                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                                                                                                                                                                                                  experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed.

                                                                                                                                                                                                                                                                                                                                                                  random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  ATOMForecaster Main class for forecasting tasks.

                                                                                                                                                                                                                                                                                                                                                                  DirectForecaster Train and evaluate the models in a direct fashion.

                                                                                                                                                                                                                                                                                                                                                                  TrainSizingForecaster Train and evaluate the models in a train sizing fashion.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingforecaster/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom.training import SuccessiveHalvingForecaster\n>>> from sktime.datasets import load_airline\n>>> from sktime.split import temporal_train_test_split\n\n>>> y = load_airline()\n\n>>> train, test = temporal_train_test_split(y, test_size=0.2)\n\n>>> runner = SuccessiveHalvingForecaster([\"ETS\", \"ES\"], verbose=2)\n>>> runner.run(train, test)\n\n\nTraining ========================= >>\nMetric: mape\n\n\nRun: 0 =========================== >>\nModels: ETS2, ES2\nSize of training set: 115 (50%)\nSize of test set: 29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0879\nTest evaluation --> mape: -0.202\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0879\nTest evaluation --> mape: -0.202\nTime elapsed: 0.017s\n-------------------------------------------------\nTime: 0.017s\n\n\nFinal results ==================== >>\nTotal time: 0.039s\n-------------------------------------\nETS                  --> mape: -0.202 !\nExponentialSmoothing --> mape: -0.202 !\n\n\nRun: 1 =========================== >>\nModels: ETS1\nSize of training set: 115 (100%)\nSize of test set: 29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0863\nTest evaluation --> mape: -0.202\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nFinal results ==================== >>\nTotal time: 0.021s\n-------------------------------------\nETS --> 
mape: -0.202\n\n\n>>> # Analyze the results\n>>> print(runner.results)\n\n            mape_train  mape_test  time_fit      time\nfrac model                                           \n0.5  ES2       -0.0879     -0.202  0.017015  0.017015\n     ETS2      -0.0879     -0.202  0.020018  0.020018\n1.0  ETS1      -0.0863     -0.202  0.020018  0.020018\n\n\n>>> print(runner.evaluate())\n\n          mae   mape        mse      r2     rmse\nETS2 -81.4454 -0.202 -8673.3633 -0.4208 -93.1309\nES2  -81.4483 -0.202 -8673.9309 -0.4209 -93.1339\nETS1 -81.4454 -0.202 -8673.3633 -0.4208 -93.1309\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingforecaster/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/successivehalvingforecaster/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.

                                                                                                                                                                                                                                                                                                                                                                  Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set.

                                                                                                                                                                                                                                                                                                                                                                  This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingforecaster/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The utility attributes are used to access information about the models in the instance after training.

                                                                                                                                                                                                                                                                                                                                                                  Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance.

                                                                                                                                                                                                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. winner: model | NoneBest performing model.

                                                                                                                                                                                                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. results: pd.DataFrameOverview of the training results.

                                                                                                                                                                                                                                                                                                                                                                  All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                                                                                                  • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                                  • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                                                                                                  • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                                                                                                  • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                                                                                                  • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                                                                                                  • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                                                                                                  • time: Total duration of the run.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingforecaster/#tracking-attributes", "title": "Tracking attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingforecaster/#plot-attributes", "title": "Plot attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Attributespalette: str | Sequence[str]Color palette.

                                                                                                                                                                                                                                                                                                                                                                  Specify one of plotly's built-in palettes or create a custom one, e.g., atom.palette = [\"red\", \"green\", \"blue\"]. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingforecaster/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  method available_models()[source]Give an overview of the available predefined models.

                                                                                                                                                                                                                                                                                                                                                                  Returnspd.DataFrame Information about the available predefined models. Columns include:

                                                                                                                                                                                                                                                                                                                                                                  • acronym: Model's acronym (used to call the model).
                                                                                                                                                                                                                                                                                                                                                                  • model: Name of the model's class.
                                                                                                                                                                                                                                                                                                                                                                  • estimator: The model's underlying estimator.
                                                                                                                                                                                                                                                                                                                                                                  • module: The estimator's module.
                                                                                                                                                                                                                                                                                                                                                                  • needs_scaling: Whether the model requires feature scaling.
                                                                                                                                                                                                                                                                                                                                                                  • accepts_sparse: Whether the model accepts sparse matrices.
                                                                                                                                                                                                                                                                                                                                                                  • native_multilabel: Whether the model has native support for multilabel tasks.
                                                                                                                                                                                                                                                                                                                                                                  • native_multioutput: Whether the model has native support for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                                  • has_validation: Whether the model has in-training validation.
                                                                                                                                                                                                                                                                                                                                                                  • supports_engines: Engines supported by the model.

                                                                                                                                                                                                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from all models.

                                                                                                                                                                                                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                                                                                                                                                                                                  • Shap values
                                                                                                                                                                                                                                                                                                                                                                  • App instance
                                                                                                                                                                                                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                                                                                                                                                                                                  method delete(models=None)[source]Delete models.

                                                                                                                                                                                                                                                                                                                                                                  If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted.

                                                                                                                                                                                                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task.

                                                                                                                                                                                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.

                                                                                                                                                                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                                                                                                  Returnspd.DataFrame Scores of the models.

                                                                                                                                                                                                                                                                                                                                                                  method export_pipeline(model=None)[source]Export the internal pipeline.

                                                                                                                                                                                                                                                                                                                                                                  This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported.

                                                                                                                                                                                                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                                                                                                  method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.

                                                                                                                                                                                                                                                                                                                                                                  Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.

                                                                                                                                                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                                                                                                                                                                                                  method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.

                                                                                                                                                                                                                                                                                                                                                                  Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.

                                                                                                                                                                                                                                                                                                                                                                  Parametersother: Runner Instance with which to merge. Should be of the same class as self.

                                                                                                                                                                                                                                                                                                                                                                  suffix: str, default=\"2\" Branches and models with conflicting names are merged adding suffix to the end of their names.

                                                                                                                                                                                                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                                                                                                  method run(*arrays)[source]Train and evaluate the models.

                                                                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are:

                                                                                                                                                                                                                                                                                                                                                                  • train, test
                                                                                                                                                                                                                                                                                                                                                                  • X_train, X_test, y_train, y_test
                                                                                                                                                                                                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test)

                                                                                                                                                                                                                                                                                                                                                                  method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

                                                                                                                                                                                                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                                                                                                  save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance.

                                                                                                                                                                                                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                                                                                                                                                                                                  method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                                                                                                                                                                                                  name: str, default=\"Stack\" Name of the model. The name is always presided with the model's acronym: Stack.

                                                                                                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the final_estimator parameter.

                                                                                                                                                                                                                                                                                                                                                                  method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                                                                                                                                                                                                  name: str, default=\"Vote\" Name of the model. The name is always presided with the model's acronym: Vote.

                                                                                                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's voting instance.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingregressor/", "title": "SuccessiveHalvingRegressor", "text": "

                                                                                                                                                                                                                                                                                                                                                                  class atom.training.SuccessiveHalvingRegressor(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a successive halving fashion.

                                                                                                                                                                                                                                                                                                                                                                  The following steps are applied to every model (per iteration):

                                                                                                                                                                                                                                                                                                                                                                  1. Apply hyperparameter tuning (optional).
                                                                                                                                                                                                                                                                                                                                                                  2. Fit the model on the training set using the best combination of hyperparameters found.
                                                                                                                                                                                                                                                                                                                                                                  3. Evaluate the model on the test set.
                                                                                                                                                                                                                                                                                                                                                                  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used.

                                                                                                                                                                                                                                                                                                                                                                  metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature function(y_true, y_pred, **kwargs) -> score, a scorer object or a sequence of these. If None, the default metric r2 is selected.

                                                                                                                                                                                                                                                                                                                                                                  skip_runs: int, default=0 Skip last skip_runs runs of the successive halving.

                                                                                                                                                                                                                                                                                                                                                                  n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                                                                                                                                                                                                  est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add _fit to the parameter's name to pass it to the estimator's fit method instead of the constructor.

                                                                                                                                                                                                                                                                                                                                                                  ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include:

                                                                                                                                                                                                                                                                                                                                                                  • cv: int, cv-generator, dict or sequence, default=1 Cross-validation object or number of splits. If 1, the data is randomly split in a subtrain and validation set.
                                                                                                                                                                                                                                                                                                                                                                  • plot: bool, dict or sequence, default=False Whether to plot the optimization's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. See the plot_trials method.
                                                                                                                                                                                                                                                                                                                                                                  • distributions: dict, sequence or None, default=None Custom hyperparameter distributions. If None, it uses the model's predefined distributions. Read more in the user guide.
                                                                                                                                                                                                                                                                                                                                                                  • tags: dict, sequence or None, default=None Custom tags for the model's trial and mlflow run.
                                                                                                                                                                                                                                                                                                                                                                  • **kwargs Additional Keyword arguments for the constructor of the study class or the optimize method.

                                                                                                                                                                                                                                                                                                                                                                  n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                                                                                                                                                                                                  parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using parallel=True turns off the verbosity of the models during training. Note that many models also have build-in parallelizations (often when the estimator has the n_jobs parameter).

                                                                                                                                                                                                                                                                                                                                                                  errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"raise\": Raise any encountered exception.
                                                                                                                                                                                                                                                                                                                                                                  • \"skip\": Skip a failed model. This model is not accessible after training.
                                                                                                                                                                                                                                                                                                                                                                  • \"keep\": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter optimization after failure without losing previous successful trials.

                                                                                                                                                                                                                                                                                                                                                                  n_jobs: int, default=1 Number of cores to use for parallel processing.

                                                                                                                                                                                                                                                                                                                                                                  • If >0: Number of cores to use.
                                                                                                                                                                                                                                                                                                                                                                  • If -1: Use all available cores.
                                                                                                                                                                                                                                                                                                                                                                  • If <-1: Use number of cores - 1 + n_jobs.

                                                                                                                                                                                                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                                                                                                                                                                                                    • \"sklearnex\"
                                                                                                                                                                                                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                                                                                                                                                                                                  backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"loky\": Single-node, process-based parallelism.
                                                                                                                                                                                                                                                                                                                                                                  • \"multiprocessing\": Legacy single-node, process-based parallelism. Less robust than loky.
                                                                                                                                                                                                                                                                                                                                                                  • \"threading\": Single-node, thread-based parallelism.
                                                                                                                                                                                                                                                                                                                                                                  • \"ray\": Multi-node, process-based parallelism.

                                                                                                                                                                                                                                                                                                                                                                  memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  • If False: No caching is performed.
                                                                                                                                                                                                                                                                                                                                                                  • If True: A default temp directory is used.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Path to the caching directory.
                                                                                                                                                                                                                                                                                                                                                                  • If Path: A pathlib.Path to the caching directory.
                                                                                                                                                                                                                                                                                                                                                                  • If Memory: Object with the joblib.Memory interface.

                                                                                                                                                                                                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                                                                                                                                                                                                  warnings: bool or str, default=False

                                                                                                                                                                                                                                                                                                                                                                  • If True: Default warning action (equal to \"once\").
                                                                                                                                                                                                                                                                                                                                                                  • If False: Suppress all warnings (equal to \"ignore\").
                                                                                                                                                                                                                                                                                                                                                                  • If str: One of python's warnings filters.

                                                                                                                                                                                                                                                                                                                                                                  Changing this parameter affects the PYTHONWarnings environment. ATOM can't manage warnings that go from C/C++ code to stdout.

                                                                                                                                                                                                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic name.
                                                                                                                                                                                                                                                                                                                                                                  • If Path: A pathlib.Path to the log file.
                                                                                                                                                                                                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                                                                                                                                                                                                  experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed.

                                                                                                                                                                                                                                                                                                                                                                  random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  ATOMRegressor Main class for regression tasks.

                                                                                                                                                                                                                                                                                                                                                                  DirectRegressor Train and evaluate the models in a direct fashion.

                                                                                                                                                                                                                                                                                                                                                                  TrainSizingRegressor Train and evaluate the models in a train sizing fashion.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingregressor/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom.training import SuccessiveHalvingRegressor\n>>> from sklearn.datasets import load_digits\n>>> from sklearn.model_selection import train_test_split\n\n>>> X, y = load_digits(return_X_y=True, as_frame=True)\n\n>>> train, test = train_test_split(\n...     X.merge(y.to_frame(), left_index=True, right_index=True),\n...     test_size=0.3,\n... )\n\n>>> runner = SuccessiveHalvingRegressor([\"OLS\", \"RF\"], verbose=2)\n>>> runner.run(train, test)\n\n\nTraining ========================= >>\nMetric: r2\n\n\nRun: 0 =========================== >>\nModels: OLS2, RF2\nSize of training set: 1257 (50%)\nSize of test set: 540\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.6083\nTest evaluation --> r2: -2.168057727555873e+23\nTime elapsed: 0.146s\n-------------------------------------------------\nTime: 0.146s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.9685\nTest evaluation --> r2: 0.7924\nTime elapsed: 0.913s\n-------------------------------------------------\nTime: 0.913s\n\n\nFinal results ==================== >>\nTotal time: 1.061s\n-------------------------------------\nOrdinaryLeastSquares --> r2: -2.168057727555873e+23 ~\nRandomForest         --> r2: 0.7924 !\n\n\nRun: 1 =========================== >>\nModels: RF1\nSize of training set: 1257 (100%)\nSize of test set: 540\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.9802\nTest evaluation --> r2: 0.8692\nTime elapsed: 
1.571s\n-------------------------------------------------\nTime: 1.571s\n\n\nFinal results ==================== >>\nTotal time: 1.573s\n-------------------------------------\nRandomForest --> r2: 0.8692\n\n\n>>> # Analyze the results\n>>> print(runner.results)\n\n            r2_train       r2_test  time_fit      time\nfrac model                                            \n0.5  OLS2     0.6083 -2.168058e+23  0.146151  0.146151\n     RF2      0.9685  7.924000e-01  0.912829  0.912829\n1.0  RF1      0.9802  8.692000e-01  1.571428  1.571428\n\n\n>>> print(runner.evaluate())\n\n               mae          mape           mse            r2          rmse\nOLS2 -1.375810e+11 -6.979478e+14 -1.715067e+24 -2.168058e+23 -1.309606e+12\nRF2  -8.656000e-01 -3.503634e+14 -1.642300e+00  7.924000e-01 -1.281500e+00\nRF1  -6.385000e-01 -1.768080e+14 -1.034400e+00  8.692000e-01 -1.017000e+00\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingregressor/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/successivehalvingregressor/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.

                                                                                                                                                                                                                                                                                                                                                                  Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set.

                                                                                                                                                                                                                                                                                                                                                                  This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingregressor/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The utility attributes are used to access information about the models in the instance after training.

                                                                                                                                                                                                                                                                                                                                                                  Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance.

                                                                                                                                                                                                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. winner: model | NoneBest performing model.

                                                                                                                                                                                                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. results: pd.DataFrameOverview of the training results.

                                                                                                                                                                                                                                                                                                                                                                  All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                                                                                                  • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                                  • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                                                                                                  • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                                                                                                  • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                                                                                                  • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                                                                                                  • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                                                                                                  • time: Total duration of the run.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingregressor/#tracking-attributes", "title": "Tracking attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingregressor/#plot-attributes", "title": "Plot attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Attributespalette: str | Sequence[str]Color palette.

                                                                                                                                                                                                                                                                                                                                                                  Specify one of plotly's built-in palettes or create a custom one, e.g., atom.palette = [\"red\", \"green\", \"blue\"]. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/successivehalvingregressor/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  method available_models()[source]Give an overview of the available predefined models.

                                                                                                                                                                                                                                                                                                                                                                  Returnspd.DataFrame Information about the available predefined models. Columns include:

                                                                                                                                                                                                                                                                                                                                                                  • acronym: Model's acronym (used to call the model).
                                                                                                                                                                                                                                                                                                                                                                  • model: Name of the model's class.
                                                                                                                                                                                                                                                                                                                                                                  • estimator: The model's underlying estimator.
                                                                                                                                                                                                                                                                                                                                                                  • module: The estimator's module.
                                                                                                                                                                                                                                                                                                                                                                  • needs_scaling: Whether the model requires feature scaling.
                                                                                                                                                                                                                                                                                                                                                                  • accepts_sparse: Whether the model accepts sparse matrices.
                                                                                                                                                                                                                                                                                                                                                                  • native_multilabel: Whether the model has native support for multilabel tasks.
                                                                                                                                                                                                                                                                                                                                                                  • native_multioutput: Whether the model has native support for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                                  • has_validation: Whether the model has in-training validation.
                                                                                                                                                                                                                                                                                                                                                                  • supports_engines: Engines supported by the model.

                                                                                                                                                                                                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from all models.

                                                                                                                                                                                                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                                                                                                                                                                                                  • Shap values
                                                                                                                                                                                                                                                                                                                                                                  • App instance
                                                                                                                                                                                                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                                                                                                                                                                                                  method delete(models=None)[source]Delete models.

                                                                                                                                                                                                                                                                                                                                                                  If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted.

                                                                                                                                                                                                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task.

                                                                                                                                                                                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.

                                                                                                                                                                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                                                                                                  Returnspd.DataFrame Scores of the models.

                                                                                                                                                                                                                                                                                                                                                                  method export_pipeline(model=None)[source]Export the internal pipeline.

                                                                                                                                                                                                                                                                                                                                                                  This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported.

                                                                                                                                                                                                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                                                                                                  method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.

                                                                                                                                                                                                                                                                                                                                                                  Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.

                                                                                                                                                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                                                                                                                                                                                                  method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.

                                                                                                                                                                                                                                                                                                                                                                  Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.

                                                                                                                                                                                                                                                                                                                                                                  Parametersother: Runner Instance with which to merge. Should be of the same class as self.

                                                                                                                                                                                                                                                                                                                                                                  suffix: str, default=\"2\" Branches and models with conflicting names are merged adding suffix to the end of their names.

                                                                                                                                                                                                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                                                                                                  method run(*arrays)[source]Train and evaluate the models.

                                                                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are:

                                                                                                                                                                                                                                                                                                                                                                  • train, test
                                                                                                                                                                                                                                                                                                                                                                  • X_train, X_test, y_train, y_test
                                                                                                                                                                                                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test)

                                                                                                                                                                                                                                                                                                                                                                  method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

                                                                                                                                                                                                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                                                                                                  save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance.

                                                                                                                                                                                                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                                                                                                                                                                                                  method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                                                                                                                                                                                                  name: str, default=\"Stack\" Name of the model. The name is always presided with the model's acronym: Stack.

                                                                                                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the final_estimator parameter.

                                                                                                                                                                                                                                                                                                                                                                  method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                                                                                                                                                                                                  name: str, default=\"Vote\" Name of the model. The name is always presided with the model's acronym: Vote.

                                                                                                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's voting instance.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/trainsizingclassifier/", "title": "TrainSizingClassifier", "text": "

                                                                                                                                                                                                                                                                                                                                                                  class atom.training.TrainSizingClassifier(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a train sizing fashion.

                                                                                                                                                                                                                                                                                                                                                                  The following steps are applied to every model (per iteration):

                                                                                                                                                                                                                                                                                                                                                                  1. Apply hyperparameter tuning (optional).
                                                                                                                                                                                                                                                                                                                                                                  2. Fit the model on the training set using the best combination of hyperparameters found.
                                                                                                                                                                                                                                                                                                                                                                  3. Evaluate the model on the test set.
                                                                                                                                                                                                                                                                                                                                                                  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used.

                                                                                                                                                                                                                                                                                                                                                                  metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature function(y_true, y_pred, **kwargs) -> score, a scorer object or a sequence of these. If None, a default metric is selected for every task:

                                                                                                                                                                                                                                                                                                                                                                  • \"f1\" for binary classification
                                                                                                                                                                                                                                                                                                                                                                  • \"f1_weighted\" for multiclass(-multioutput) classification
                                                                                                                                                                                                                                                                                                                                                                  • \"average_precision\" for multilabel classification

                                                                                                                                                                                                                                                                                                                                                                  train_sizes: int or sequence, default=5 Training set sizes used to run the trainings.

                                                                                                                                                                                                                                                                                                                                                                  • If int: Number of equally distributed splits, i.e., for a value N, it's equal to np.linspace(1.0/N, 1.0, N).
                                                                                                                                                                                                                                                                                                                                                                  • If sequence: Fraction of the training set when <=1, else total number of samples.

                                                                                                                                                                                                                                                                                                                                                                  n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                                                                                                                                                                                                  est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add _fit to the parameter's name to pass it to the estimator's fit method instead of the constructor.

                                                                                                                                                                                                                                                                                                                                                                  ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include:

                                                                                                                                                                                                                                                                                                                                                                  • cv: int, cv-generator, dict or sequence, default=1 Cross-validation object or number of splits. If 1, the data is randomly split in a subtrain and validation set.
                                                                                                                                                                                                                                                                                                                                                                  • plot: bool, dict or sequence, default=False Whether to plot the optimization's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. See the plot_trials method.
                                                                                                                                                                                                                                                                                                                                                                  • distributions: dict, sequence or None, default=None Custom hyperparameter distributions. If None, it uses the model's predefined distributions. Read more in the user guide.
                                                                                                                                                                                                                                                                                                                                                                  • tags: dict, sequence or None, default=None Custom tags for the model's trial and mlflow run.
                                                                                                                                                                                                                                                                                                                                                                  • **kwargs Additional Keyword arguments for the constructor of the study class or the optimize method.

                                                                                                                                                                                                                                                                                                                                                                  n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                                                                                                                                                                                                  parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using parallel=True turns off the verbosity of the models during training. Note that many models also have build-in parallelizations (often when the estimator has the n_jobs parameter).

                                                                                                                                                                                                                                                                                                                                                                  errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"raise\": Raise any encountered exception.
                                                                                                                                                                                                                                                                                                                                                                  • \"skip\": Skip a failed model. This model is not accessible after training.
                                                                                                                                                                                                                                                                                                                                                                  • \"keep\": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter optimization after failure without losing previous successful trials.

                                                                                                                                                                                                                                                                                                                                                                  n_jobs: int, default=1 Number of cores to use for parallel processing.

                                                                                                                                                                                                                                                                                                                                                                  • If >0: Number of cores to use.
                                                                                                                                                                                                                                                                                                                                                                  • If -1: Use all available cores.
                                                                                                                                                                                                                                                                                                                                                                  • If <-1: Use number of cores - 1 + n_jobs.

                                                                                                                                                                                                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                                                                                                                                                                                                    • \"sklearnex\"
                                                                                                                                                                                                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                                                                                                                                                                                                  backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"loky\": Single-node, process-based parallelism.
                                                                                                                                                                                                                                                                                                                                                                  • \"multiprocessing\": Legacy single-node, process-based parallelism. Less robust than loky.
                                                                                                                                                                                                                                                                                                                                                                  • \"threading\": Single-node, thread-based parallelism.
                                                                                                                                                                                                                                                                                                                                                                  • \"ray\": Multi-node, process-based parallelism.

                                                                                                                                                                                                                                                                                                                                                                  memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  • If False: No caching is performed.
                                                                                                                                                                                                                                                                                                                                                                  • If True: A default temp directory is used.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Path to the caching directory.
                                                                                                                                                                                                                                                                                                                                                                  • If Path: A pathlib.Path to the caching directory.
                                                                                                                                                                                                                                                                                                                                                                  • If Memory: Object with the joblib.Memory interface.

                                                                                                                                                                                                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                                                                                                                                                                                                  warnings: bool or str, default=False

                                                                                                                                                                                                                                                                                                                                                                  • If True: Default warning action (equal to \"once\").
                                                                                                                                                                                                                                                                                                                                                                  • If False: Suppress all warnings (equal to \"ignore\").
                                                                                                                                                                                                                                                                                                                                                                  • If str: One of python's warnings filters.

                                                                                                                                                                                                                                                                                                                                                                  Changing this parameter affects the PYTHONWarnings environment. ATOM can't manage warnings that go from C/C++ code to stdout.

                                                                                                                                                                                                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic name.
                                                                                                                                                                                                                                                                                                                                                                  • If Path: A pathlib.Path to the log file.
                                                                                                                                                                                                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                                                                                                                                                                                                  experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed.

                                                                                                                                                                                                                                                                                                                                                                  random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  ATOMRegressor Main class for regression tasks.

                                                                                                                                                                                                                                                                                                                                                                  DirectRegressor Train and evaluate the models in a direct fashion.

                                                                                                                                                                                                                                                                                                                                                                  SuccessiveHalvingRegressor Train and evaluate the models in a successive halving fashion.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/trainsizingclassifier/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom.training import TrainSizingClassifier\n>>> from sklearn.datasets import load_breast_cancer\n>>> from sklearn.model_selection import train_test_split\n\n>>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n>>> train, test = train_test_split(\n...     X.merge(y.to_frame(), left_index=True, right_index=True),\n...     test_size=0.3,\n... )\n\n>>> runner = TrainSizingClassifier(models=\"LR\", verbose=2)\n>>> runner.run(train, test)\n\n\nTraining ========================= >>\nMetric: f1\n\n\nRun: 0 =========================== >>\nModels: LR02\nSize of training set: 79 (20%)\nSize of test set: 171\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9899\nTest evaluation --> f1: 0.9455\nTime elapsed: 0.086s\n-------------------------------------------------\nTime: 0.086s\n\n\nFinal results ==================== >>\nTotal time: 0.089s\n-------------------------------------\nLogisticRegression --> f1: 0.9455\n\n\nRun: 1 =========================== >>\nModels: LR04\nSize of training set: 159 (40%)\nSize of test set: 171\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9898\nTest evaluation --> f1: 0.9727\nTime elapsed: 0.086s\n-------------------------------------------------\nTime: 0.086s\n\n\nFinal results ==================== >>\nTotal time: 0.088s\n-------------------------------------\nLogisticRegression --> f1: 0.9727\n\n\nRun: 2 =========================== >>\nModels: LR06\nSize of training set: 238 (60%)\nSize of test set: 171\n\n\nResults for 
LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9936\nTest evaluation --> f1: 0.9683\nTime elapsed: 0.085s\n-------------------------------------------------\nTime: 0.085s\n\n\nFinal results ==================== >>\nTotal time: 0.088s\n-------------------------------------\nLogisticRegression --> f1: 0.9683\n\n\nRun: 3 =========================== >>\nModels: LR08\nSize of training set: 318 (80%)\nSize of test set: 171\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9901\nTest evaluation --> f1: 0.9817\nTime elapsed: 0.096s\n-------------------------------------------------\nTime: 0.096s\n\n\nFinal results ==================== >>\nTotal time: 0.099s\n-------------------------------------\nLogisticRegression --> f1: 0.9817\n\n\nRun: 4 =========================== >>\nModels: LR10\nSize of training set: 398 (100%)\nSize of test set: 171\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.992\nTest evaluation --> f1: 0.9772\nTime elapsed: 0.099s\n-------------------------------------------------\nTime: 0.099s\n\n\nFinal results ==================== >>\nTotal time: 0.102s\n-------------------------------------\nLogisticRegression --> f1: 0.9772\n\n\n>>> # Analyze the results\n>>> print(runner.results)\n\n            f1_train  f1_test  time_fit      time\nfrac model                                       \n0.2  LR02     0.9899   0.9455  0.086078  0.086078\n0.4  LR04     0.9898   0.9727  0.086078  0.086078\n0.6  LR06     0.9936   0.9683  0.085077  0.085077\n0.8  LR08     0.9901   0.9817  0.095865  0.095865\n1.0  LR10     0.9920   0.9772  0.098852  0.098852\n\n\n>>> print(runner.evaluate())\n\n      accuracy      ap      ba      f1  jaccard     mcc  precision  recall     auc\nLR02    0.9298  0.9916  0.9180  0.9455   0.8966  0.8483     0.9286  0.9630  0.9857\nLR04    0.9649  0.9971  0.9557  
0.9727   0.9469  0.9248     0.9554  0.9907  0.9950\nLR06    0.9591  0.9976  0.9478  0.9683   0.9386  0.9124     0.9469  0.9907  0.9959\nLR08    0.9766  0.9963  0.9716  0.9817   0.9640  0.9497     0.9727  0.9907  0.9938\nLR10    0.9708  0.9973  0.9636  0.9772   0.9554  0.9372     0.9640  0.9907  0.9954\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/trainsizingclassifier/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/trainsizingclassifier/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.

                                                                                                                                                                                                                                                                                                                                                                  Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set.

                                                                                                                                                                                                                                                                                                                                                                  This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/trainsizingclassifier/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The utility attributes are used to access information about the models in the instance after training.

                                                                                                                                                                                                                                                                                                                                                                  Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance.

                                                                                                                                                                                                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. winner: model | NoneBest performing model.

                                                                                                                                                                                                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. results: pd.DataFrameOverview of the training results.

                                                                                                                                                                                                                                                                                                                                                                  All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                                                                                                  • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                                  • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                                                                                                  • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                                                                                                  • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                                                                                                  • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                                                                                                  • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                                                                                                  • time: Total duration of the run.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/trainsizingclassifier/#tracking-attributes", "title": "Tracking attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/trainsizingclassifier/#plot-attributes", "title": "Plot attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Attributespalette: str | Sequence[str]Color palette.

                                                                                                                                                                                                                                                                                                                                                                  Specify one of plotly's built-in palettes or create a custom one, e.g., atom.palette = [\"red\", \"green\", \"blue\"]. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/trainsizingclassifier/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  method available_models()[source]Give an overview of the available predefined models.

                                                                                                                                                                                                                                                                                                                                                                  Returnspd.DataFrame Information about the available predefined models. Columns include:

                                                                                                                                                                                                                                                                                                                                                                  • acronym: Model's acronym (used to call the model).
                                                                                                                                                                                                                                                                                                                                                                  • model: Name of the model's class.
                                                                                                                                                                                                                                                                                                                                                                  • estimator: The model's underlying estimator.
                                                                                                                                                                                                                                                                                                                                                                  • module: The estimator's module.
                                                                                                                                                                                                                                                                                                                                                                  • needs_scaling: Whether the model requires feature scaling.
                                                                                                                                                                                                                                                                                                                                                                  • accepts_sparse: Whether the model accepts sparse matrices.
                                                                                                                                                                                                                                                                                                                                                                  • native_multilabel: Whether the model has native support for multilabel tasks.
                                                                                                                                                                                                                                                                                                                                                                  • native_multioutput: Whether the model has native support for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                                  • has_validation: Whether the model has in-training validation.
                                                                                                                                                                                                                                                                                                                                                                  • supports_engines: Engines supported by the model.

                                                                                                                                                                                                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from all models.

                                                                                                                                                                                                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                                                                                                                                                                                                  • Shap values
                                                                                                                                                                                                                                                                                                                                                                  • App instance
                                                                                                                                                                                                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                                                                                                                                                                                                  method delete(models=None)[source]Delete models.

                                                                                                                                                                                                                                                                                                                                                                  If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted.

                                                                                                                                                                                                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task.

                                                                                                                                                                                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.

                                                                                                                                                                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                                                                                                  Returnspd.DataFrame Scores of the models.

                                                                                                                                                                                                                                                                                                                                                                  method export_pipeline(model=None)[source]Export the internal pipeline.

                                                                                                                                                                                                                                                                                                                                                                  This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported.

                                                                                                                                                                                                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                                                                                                  method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.

                                                                                                                                                                                                                                                                                                                                                                  Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.

                                                                                                                                                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                                                                                                                                                                                                  method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.

                                                                                                                                                                                                                                                                                                                                                                  Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.

                                                                                                                                                                                                                                                                                                                                                                  Parametersother: Runner Instance with which to merge. Should be of the same class as self.

                                                                                                                                                                                                                                                                                                                                                                  suffix: str, default=\"2\" Branches and models with conflicting names are merged adding suffix to the end of their names.

                                                                                                                                                                                                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                                                                                                  method run(*arrays)[source]Train and evaluate the models.

                                                                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are:

                                                                                                                                                                                                                                                                                                                                                                  • train, test
                                                                                                                                                                                                                                                                                                                                                                  • X_train, X_test, y_train, y_test
                                                                                                                                                                                                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test)

                                                                                                                                                                                                                                                                                                                                                                  method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

                                                                                                                                                                                                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                                                                                                  save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance.

                                                                                                                                                                                                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                                                                                                                                                                                                  method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                                                                                                                                                                                                  name: str, default=\"Stack\" Name of the model. The name is always presided with the model's acronym: Stack.

                                                                                                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the final_estimator parameter.

                                                                                                                                                                                                                                                                                                                                                                  method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                                                                                                                                                                                                  name: str, default=\"Vote\" Name of the model. The name is always presided with the model's acronym: Vote.

                                                                                                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's voting instance.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/trainsizingforecaster/", "title": "TrainSizingForecaster", "text": "

                                                                                                                                                                                                                                                                                                                                                                  class atom.training.TrainSizingForecaster(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a train sizing fashion.

                                                                                                                                                                                                                                                                                                                                                                  The following steps are applied to every model (per iteration):

                                                                                                                                                                                                                                                                                                                                                                  1. Apply hyperparameter tuning (optional).
                                                                                                                                                                                                                                                                                                                                                                  2. Fit the model on the training set using the best combination of hyperparameters found.
                                                                                                                                                                                                                                                                                                                                                                  3. Evaluate the model on the test set.
                                                                                                                                                                                                                                                                                                                                                                  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used.

                                                                                                                                                                                                                                                                                                                                                                  metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature function(y_true, y_pred, **kwargs) -> score, a scorer object or a sequence of these. If None, the default metric mean_absolute_percentage_error is selected.

                                                                                                                                                                                                                                                                                                                                                                  train_sizes: int or sequence, default=5 Training set sizes used to run the trainings.

                                                                                                                                                                                                                                                                                                                                                                  • If int: Number of equally distributed splits, i.e., for a value N, it's equal to np.linspace(1.0/N, 1.0, N).
                                                                                                                                                                                                                                                                                                                                                                  • If sequence: Fraction of the training set when <=1, else total number of samples.

                                                                                                                                                                                                                                                                                                                                                                  n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                                                                                                                                                                                                  est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add _fit to the parameter's name to pass it to the estimator's fit method instead of the constructor.

                                                                                                                                                                                                                                                                                                                                                                  ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include:

                                                                                                                                                                                                                                                                                                                                                                  • cv: int, cv-generator, dict or sequence, default=1 Cross-validation object or number of splits. If 1, the data is randomly split in a subtrain and validation set.
                                                                                                                                                                                                                                                                                                                                                                  • plot: bool, dict or sequence, default=False Whether to plot the optimization's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. See the plot_trials method.
                                                                                                                                                                                                                                                                                                                                                                  • distributions: dict, sequence or None, default=None Custom hyperparameter distributions. If None, it uses the model's predefined distributions. Read more in the user guide.
                                                                                                                                                                                                                                                                                                                                                                  • tags: dict, sequence or None, default=None Custom tags for the model's trial and mlflow run.
                                                                                                                                                                                                                                                                                                                                                                  • **kwargs Additional Keyword arguments for the constructor of the study class or the optimize method.

                                                                                                                                                                                                                                                                                                                                                                  n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                                                                                                                                                                                                  parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using parallel=True turns off the verbosity of the models during training. Note that many models also have build-in parallelizations (often when the estimator has the n_jobs parameter).

                                                                                                                                                                                                                                                                                                                                                                  errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"raise\": Raise any encountered exception.
                                                                                                                                                                                                                                                                                                                                                                  • \"skip\": Skip a failed model. This model is not accessible after training.
                                                                                                                                                                                                                                                                                                                                                                  • \"keep\": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter optimization after failure without losing previous successful trials.

                                                                                                                                                                                                                                                                                                                                                                  n_jobs: int, default=1 Number of cores to use for parallel processing.

                                                                                                                                                                                                                                                                                                                                                                  • If >0: Number of cores to use.
                                                                                                                                                                                                                                                                                                                                                                  • If -1: Use all available cores.
                                                                                                                                                                                                                                                                                                                                                                  • If <-1: Use number of cores - 1 + n_jobs.

                                                                                                                                                                                                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                                                                                                                                                                                                    • \"sklearnex\"
                                                                                                                                                                                                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                                                                                                                                                                                                  backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"loky\": Single-node, process-based parallelism.
                                                                                                                                                                                                                                                                                                                                                                  • \"multiprocessing\": Legacy single-node, process-based parallelism. Less robust than loky.
                                                                                                                                                                                                                                                                                                                                                                  • \"threading\": Single-node, thread-based parallelism.
                                                                                                                                                                                                                                                                                                                                                                  • \"ray\": Multi-node, process-based parallelism.

                                                                                                                                                                                                                                                                                                                                                                  memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  • If False: No caching is performed.
                                                                                                                                                                                                                                                                                                                                                                  • If True: A default temp directory is used.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Path to the caching directory.
                                                                                                                                                                                                                                                                                                                                                                  • If Path: A pathlib.Path to the caching directory.
                                                                                                                                                                                                                                                                                                                                                                  • If Memory: Object with the joblib.Memory interface.

                                                                                                                                                                                                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                                                                                                                                                                                                  warnings: bool or str, default=False

                                                                                                                                                                                                                                                                                                                                                                  • If True: Default warning action (equal to \"once\").
                                                                                                                                                                                                                                                                                                                                                                  • If False: Suppress all warnings (equal to \"ignore\").
                                                                                                                                                                                                                                                                                                                                                                  • If str: One of python's warnings filters.

                                                                                                                                                                                                                                                                                                                                                                  Changing this parameter affects the PYTHONWarnings environment. ATOM can't manage warnings that go from C/C++ code to stdout.

                                                                                                                                                                                                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic name.
                                                                                                                                                                                                                                                                                                                                                                  • If Path: A pathlib.Path to the log file.
                                                                                                                                                                                                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                                                                                                                                                                                                  experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed.

                                                                                                                                                                                                                                                                                                                                                                  random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  ATOMForecaster Main class for forecasting tasks.

                                                                                                                                                                                                                                                                                                                                                                  DirectForecaster Train and evaluate the models in a direct fashion.

                                                                                                                                                                                                                                                                                                                                                                  SuccessiveHalvingForecaster Train and evaluate the models in a successive halving fashion.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/trainsizingforecaster/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom.training import TrainSizingForecaster\n>>> from sktime.datasets import load_airline\n>>> from sktime.split import temporal_train_test_split\n\n>>> y = load_airline()\n\n>>> train, test = temporal_train_test_split(y, test_size=0.2)\n\n>>> runner = TrainSizingForecaster([\"ETS\", \"ES\"], verbose=2)\n>>> runner.run(train, test)\n\n\nTraining ========================= >>\nMetric: mape\n\n\nRun: 0 =========================== >>\nModels: ETS02, ES02\nSize of training set: 23 (20%)\nSize of test set: 29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0889\nTest evaluation --> mape: -0.202\nTime elapsed: 0.021s\n-------------------------------------------------\nTime: 0.021s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0889\nTest evaluation --> mape: -0.202\nTime elapsed: 0.017s\n-------------------------------------------------\nTime: 0.017s\n\n\nFinal results ==================== >>\nTotal time: 0.041s\n-------------------------------------\nETS                  --> mape: -0.202 !\nExponentialSmoothing --> mape: -0.202 !\n\n\nRun: 1 =========================== >>\nModels: ETS04, ES04\nSize of training set: 46 (40%)\nSize of test set: 29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0871\nTest evaluation --> mape: -0.202\nTime elapsed: 0.019s\n-------------------------------------------------\nTime: 0.019s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --> mape: 
-0.0871\nTest evaluation --> mape: -0.202\nTime elapsed: 0.018s\n-------------------------------------------------\nTime: 0.018s\n\n\nFinal results ==================== >>\nTotal time: 0.039s\n-------------------------------------\nETS                  --> mape: -0.202 !\nExponentialSmoothing --> mape: -0.202 !\n\n\nRun: 2 =========================== >>\nModels: ETS06, ES06\nSize of training set: 69 (60%)\nSize of test set: 29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0861\nTest evaluation --> mape: -0.202\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0867\nTest evaluation --> mape: -0.2016\nTime elapsed: 0.017s\n-------------------------------------------------\nTime: 0.017s\n\n\nFinal results ==================== >>\nTotal time: 0.038s\n-------------------------------------\nETS                  --> mape: -0.202\nExponentialSmoothing --> mape: -0.2016 !\n\n\nRun: 3 =========================== >>\nModels: ETS08, ES08\nSize of training set: 92 (80%)\nSize of test set: 29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0842\nTest evaluation --> mape: -0.202\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0845\nTest evaluation --> mape: -0.202\nTime elapsed: 0.018s\n-------------------------------------------------\nTime: 0.018s\n\n\nFinal results ==================== >>\nTotal time: 0.040s\n-------------------------------------\nETS                  --> mape: -0.202 !\nExponentialSmoothing --> mape: -0.202 !\n\n\nRun: 4 =========================== >>\nModels: ETS10, ES10\nSize of training set: 115 (100%)\nSize of test set: 
29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0863\nTest evaluation --> mape: -0.202\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --> mape: -0.0868\nTest evaluation --> mape: -0.2018\nTime elapsed: 0.018s\n-------------------------------------------------\nTime: 0.018s\n\n\nFinal results ==================== >>\nTotal time: 0.040s\n-------------------------------------\nETS                  --> mape: -0.202\nExponentialSmoothing --> mape: -0.2018 !\n\n\n>>> # Analyze the results\n>>> print(runner.results)\n\n            mape_train  mape_test  time_fit      time\nfrac model                                           \n0.2  ES02      -0.0889    -0.2020  0.017015  0.017015\n     ETS02     -0.0889    -0.2020  0.021020  0.021020\n0.4  ES04      -0.0871    -0.2020  0.018016  0.018016\n     ETS04     -0.0871    -0.2020  0.019017  0.019017\n0.6  ES06      -0.0867    -0.2016  0.017015  0.017015\n     ETS06     -0.0861    -0.2020  0.020019  0.020019\n0.8  ES08      -0.0845    -0.2020  0.018016  0.018016\n     ETS08     -0.0842    -0.2020  0.020018  0.020018\n1.0  ES10      -0.0868    -0.2018  0.018016  0.018016\n     ETS10     -0.0863    -0.2020  0.020018  0.020018\n\n\n>>> print(runner.evaluate())\n\n           mae    mape        mse      r2     rmse\nETS02 -81.4454 -0.2020 -8673.3633 -0.4208 -93.1309\nES02  -81.4444 -0.2020 -8673.1766 -0.4208 -93.1299\nETS04 -81.4454 -0.2020 -8673.3633 -0.4208 -93.1309\nES04  -81.4483 -0.2020 -8673.9309 -0.4209 -93.1339\nETS06 -81.4454 -0.2020 -8673.3633 -0.4208 -93.1309\nES06  -81.3025 -0.2016 -8645.4416 -0.4162 -92.9809\nETS08 -81.4454 -0.2020 -8673.3633 -0.4208 -93.1309\nES08  -81.4483 -0.2020 -8673.9309 -0.4209 -93.1339\nETS10 -81.4454 -0.2020 -8673.3633 -0.4208 -93.1309\nES10  -81.3862 -0.2018 -8661.7730 -0.4189 -93.0686\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/trainsizingforecaster/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/trainsizingforecaster/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.

                                                                                                                                                                                                                                                                                                                                                                  Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set.

                                                                                                                                                                                                                                                                                                                                                                  This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/trainsizingforecaster/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The utility attributes are used to access information about the models in the instance after training.

                                                                                                                                                                                                                                                                                                                                                                  Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance.

                                                                                                                                                                                                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. winner: model | NoneBest performing model.

                                                                                                                                                                                                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. results: pd.DataFrameOverview of the training results.

                                                                                                                                                                                                                                                                                                                                                                  All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                                                                                                  • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                                  • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                                                                                                  • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                                                                                                  • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                                                                                                  • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                                                                                                  • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                                                                                                  • time: Total duration of the run.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/trainsizingforecaster/#tracking-attributes", "title": "Tracking attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/trainsizingforecaster/#plot-attributes", "title": "Plot attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Attributespalette: str | Sequence[str]Color palette.

                                                                                                                                                                                                                                                                                                                                                                  Specify one of plotly's built-in palettes or create a custom one, e.g., atom.palette = [\"red\", \"green\", \"blue\"]. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/trainsizingforecaster/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  method available_models()[source]Give an overview of the available predefined models.

                                                                                                                                                                                                                                                                                                                                                                  Returnspd.DataFrame Information about the available predefined models. Columns include:

                                                                                                                                                                                                                                                                                                                                                                  • acronym: Model's acronym (used to call the model).
                                                                                                                                                                                                                                                                                                                                                                  • model: Name of the model's class.
                                                                                                                                                                                                                                                                                                                                                                  • estimator: The model's underlying estimator.
                                                                                                                                                                                                                                                                                                                                                                  • module: The estimator's module.
                                                                                                                                                                                                                                                                                                                                                                  • needs_scaling: Whether the model requires feature scaling.
                                                                                                                                                                                                                                                                                                                                                                  • accepts_sparse: Whether the model accepts sparse matrices.
                                                                                                                                                                                                                                                                                                                                                                  • native_multilabel: Whether the model has native support for multilabel tasks.
                                                                                                                                                                                                                                                                                                                                                                  • native_multioutput: Whether the model has native support for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                                  • has_validation: Whether the model has in-training validation.
                                                                                                                                                                                                                                                                                                                                                                  • supports_engines: Engines supported by the model.

                                                                                                                                                                                                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from all models.

                                                                                                                                                                                                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                                                                                                                                                                                                  • Shap values
                                                                                                                                                                                                                                                                                                                                                                  • App instance
                                                                                                                                                                                                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                                                                                                                                                                                                  method delete(models=None)[source]Delete models.

                                                                                                                                                                                                                                                                                                                                                                  If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted.

                                                                                                                                                                                                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task.

                                                                                                                                                                                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.

                                                                                                                                                                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                                                                                                  Returnspd.DataFrame Scores of the models.

                                                                                                                                                                                                                                                                                                                                                                  method export_pipeline(model=None)[source]Export the internal pipeline.

                                                                                                                                                                                                                                                                                                                                                                  This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported.

                                                                                                                                                                                                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                                                                                                  method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.

                                                                                                                                                                                                                                                                                                                                                                  Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.

                                                                                                                                                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                                                                                                                                                                                                  method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.

                                                                                                                                                                                                                                                                                                                                                                  Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.

                                                                                                                                                                                                                                                                                                                                                                  Parametersother: Runner Instance with which to merge. Should be of the same class as self.

                                                                                                                                                                                                                                                                                                                                                                  suffix: str, default=\"2\" Branches and models with conflicting names are merged adding suffix to the end of their names.

                                                                                                                                                                                                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                                                                                                  method run(*arrays)[source]Train and evaluate the models.

                                                                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are:

                                                                                                                                                                                                                                                                                                                                                                  • train, test
                                                                                                                                                                                                                                                                                                                                                                  • X_train, X_test, y_train, y_test
                                                                                                                                                                                                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test)

                                                                                                                                                                                                                                                                                                                                                                  method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

                                                                                                                                                                                                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                                                                                                  save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance.

                                                                                                                                                                                                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                                                                                                                                                                                                  method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                                                                                                                                                                                                  name: str, default=\"Stack\" Name of the model. The name is always presided with the model's acronym: Stack.

                                                                                                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the final_estimator parameter.

                                                                                                                                                                                                                                                                                                                                                                  method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                                                                                                                                                                                                  name: str, default=\"Vote\" Name of the model. The name is always presided with the model's acronym: Vote.

                                                                                                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's voting instance.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/trainsizingregressor/", "title": "TrainSizingRegressor", "text": "

                                                                                                                                                                                                                                                                                                                                                                  class atom.training.TrainSizingRegressor(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a train sizing fashion.

                                                                                                                                                                                                                                                                                                                                                                  The following steps are applied to every model (per iteration):

                                                                                                                                                                                                                                                                                                                                                                  1. Apply hyperparameter tuning (optional).
                                                                                                                                                                                                                                                                                                                                                                  2. Fit the model on the training set using the best combination of hyperparameters found.
                                                                                                                                                                                                                                                                                                                                                                  3. Evaluate the model on the test set.
                                                                                                                                                                                                                                                                                                                                                                  4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used.

                                                                                                                                                                                                                                                                                                                                                                  metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature function(y_true, y_pred, **kwargs) -> score, a scorer object or a sequence of these. If None, the default metric r2 is selected.

                                                                                                                                                                                                                                                                                                                                                                  train_sizes: int or sequence, default=5 Training set sizes used to run the trainings.

                                                                                                                                                                                                                                                                                                                                                                  • If int: Number of equally distributed splits, i.e., for a value N, it's equal to np.linspace(1.0/N, 1.0, N).
                                                                                                                                                                                                                                                                                                                                                                  • If sequence: Fraction of the training set when <=1, else total number of samples.

                                                                                                                                                                                                                                                                                                                                                                  n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                                                                                                                                                                                                  est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add _fit to the parameter's name to pass it to the estimator's fit method instead of the constructor.

                                                                                                                                                                                                                                                                                                                                                                  ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include:

                                                                                                                                                                                                                                                                                                                                                                  • cv: int, cv-generator, dict or sequence, default=1 Cross-validation object or number of splits. If 1, the data is randomly split in a subtrain and validation set.
                                                                                                                                                                                                                                                                                                                                                                  • plot: bool, dict or sequence, default=False Whether to plot the optimization's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. See the plot_trials method.
                                                                                                                                                                                                                                                                                                                                                                  • distributions: dict, sequence or None, default=None Custom hyperparameter distributions. If None, it uses the model's predefined distributions. Read more in the user guide.
                                                                                                                                                                                                                                                                                                                                                                  • tags: dict, sequence or None, default=None Custom tags for the model's trial and mlflow run.
                                                                                                                                                                                                                                                                                                                                                                  • **kwargs Additional Keyword arguments for the constructor of the study class or the optimize method.

                                                                                                                                                                                                                                                                                                                                                                  n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model.

                                                                                                                                                                                                                                                                                                                                                                  parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using parallel=True turns off the verbosity of the models during training. Note that many models also have build-in parallelizations (often when the estimator has the n_jobs parameter).

                                                                                                                                                                                                                                                                                                                                                                  errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"raise\": Raise any encountered exception.
                                                                                                                                                                                                                                                                                                                                                                  • \"skip\": Skip a failed model. This model is not accessible after training.
                                                                                                                                                                                                                                                                                                                                                                  • \"keep\": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter optimization after failure without losing previous successful trials.

                                                                                                                                                                                                                                                                                                                                                                  n_jobs: int, default=1 Number of cores to use for parallel processing.

                                                                                                                                                                                                                                                                                                                                                                  • If >0: Number of cores to use.
                                                                                                                                                                                                                                                                                                                                                                  • If -1: Use all available cores.
                                                                                                                                                                                                                                                                                                                                                                  • If <-1: Use number of cores - 1 + n_jobs.

                                                                                                                                                                                                                                                                                                                                                                  device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"data\":

                                                                                                                                                                                                                                                                                                                                                                    • \"numpy\"
                                                                                                                                                                                                                                                                                                                                                                    • \"pyarrow\"
                                                                                                                                                                                                                                                                                                                                                                    • \"modin\"
                                                                                                                                                                                                                                                                                                                                                                  • \"estimator\":

                                                                                                                                                                                                                                                                                                                                                                    • \"sklearn\"
                                                                                                                                                                                                                                                                                                                                                                    • \"sklearnex\"
                                                                                                                                                                                                                                                                                                                                                                    • \"cuml\"

                                                                                                                                                                                                                                                                                                                                                                  backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • \"loky\": Single-node, process-based parallelism.
                                                                                                                                                                                                                                                                                                                                                                  • \"multiprocessing\": Legacy single-node, process-based parallelism. Less robust than loky.
                                                                                                                                                                                                                                                                                                                                                                  • \"threading\": Single-node, thread-based parallelism.
                                                                                                                                                                                                                                                                                                                                                                  • \"ray\": Multi-node, process-based parallelism.

                                                                                                                                                                                                                                                                                                                                                                  memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  • If False: No caching is performed.
                                                                                                                                                                                                                                                                                                                                                                  • If True: A default temp directory is used.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Path to the caching directory.
                                                                                                                                                                                                                                                                                                                                                                  • If Path: A pathlib.Path to the caching directory.
                                                                                                                                                                                                                                                                                                                                                                  • If Memory: Object with the joblib.Memory interface.

                                                                                                                                                                                                                                                                                                                                                                  verbose: int, default=0 Verbosity level of the class. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • 0 to not print anything.
                                                                                                                                                                                                                                                                                                                                                                  • 1 to print basic information.
                                                                                                                                                                                                                                                                                                                                                                  • 2 to print detailed information.

                                                                                                                                                                                                                                                                                                                                                                  warnings: bool or str, default=False

                                                                                                                                                                                                                                                                                                                                                                  • If True: Default warning action (equal to \"once\").
                                                                                                                                                                                                                                                                                                                                                                  • If False: Suppress all warnings (equal to \"ignore\").
                                                                                                                                                                                                                                                                                                                                                                  • If str: One of python's warnings filters.

                                                                                                                                                                                                                                                                                                                                                                  Changing this parameter affects the PYTHONWarnings environment. ATOM can't manage warnings that go from C/C++ code to stdout.

                                                                                                                                                                                                                                                                                                                                                                  logger: str, Logger or None, default=None

                                                                                                                                                                                                                                                                                                                                                                  • If None: Logging isn't used.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Name of the log file. Use \"auto\" for automatic name.
                                                                                                                                                                                                                                                                                                                                                                  • If Path: A pathlib.Path to the log file.
                                                                                                                                                                                                                                                                                                                                                                  • Else: Python logging.Logger instance.

                                                                                                                                                                                                                                                                                                                                                                  experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed.

                                                                                                                                                                                                                                                                                                                                                                  random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

                                                                                                                                                                                                                                                                                                                                                                  See Also

                                                                                                                                                                                                                                                                                                                                                                  ATOMRegressor Main class for regression tasks.

                                                                                                                                                                                                                                                                                                                                                                  DirectRegressor Train and evaluate the models in a direct fashion.

                                                                                                                                                                                                                                                                                                                                                                  SuccessiveHalvingRegressor Train and evaluate the models in a successive halving fashion.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/trainsizingregressor/#example", "title": "Example", "text": "
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom.training import TrainSizingRegressor\n>>> from sklearn.datasets import load_digits\n>>> from sklearn.model_selection import train_test_split\n\n>>> X, y = load_digits(return_X_y=True, as_frame=True)\n\n>>> train, test = train_test_split(\n...     X.merge(y.to_frame(), left_index=True, right_index=True),\n...     test_size=0.3,\n... )\n\n>>> runner = TrainSizingRegressor(models=\"OLS\", verbose=2)\n>>> runner.run(train, test)\n\n\nTraining ========================= >>\nMetric: r2\n\n\nRun: 0 =========================== >>\nModels: OLS02\nSize of training set: 251 (20%)\nSize of test set: 540\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.6391\nTest evaluation --> r2: -4.630208907041091e+25\nTime elapsed: 0.148s\n-------------------------------------------------\nTime: 0.148s\n\n\nFinal results ==================== >>\nTotal time: 0.149s\n-------------------------------------\nOrdinaryLeastSquares --> r2: -4.630208907041091e+25 ~\n\n\nRun: 1 =========================== >>\nModels: OLS04\nSize of training set: 502 (40%)\nSize of test set: 540\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.6137\nTest evaluation --> r2: -9.496101715653298e+22\nTime elapsed: 0.150s\n-------------------------------------------------\nTime: 0.150s\n\n\nFinal results ==================== >>\nTotal time: 0.151s\n-------------------------------------\nOrdinaryLeastSquares --> r2: -9.496101715653298e+22 ~\n\n\nRun: 2 =========================== >>\nModels: OLS06\nSize of training set: 754 
(60%)\nSize of test set: 540\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.6086\nTest evaluation --> r2: -0.2872\nTime elapsed: 0.151s\n-------------------------------------------------\nTime: 0.151s\n\n\nFinal results ==================== >>\nTotal time: 0.152s\n-------------------------------------\nOrdinaryLeastSquares --> r2: -0.2872 ~\n\n\nRun: 3 =========================== >>\nModels: OLS08\nSize of training set: 1005 (80%)\nSize of test set: 540\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.5986\nTest evaluation --> r2: 0.5025\nTime elapsed: 0.150s\n-------------------------------------------------\nTime: 0.150s\n\n\nFinal results ==================== >>\nTotal time: 0.152s\n-------------------------------------\nOrdinaryLeastSquares --> r2: 0.5025\n\n\nRun: 4 =========================== >>\nModels: OLS10\nSize of training set: 1257 (100%)\nSize of test set: 540\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.5951\nTest evaluation --> r2: 0.5864\nTime elapsed: 0.150s\n-------------------------------------------------\nTime: 0.150s\n\n\nFinal results ==================== >>\nTotal time: 0.151s\n-------------------------------------\nOrdinaryLeastSquares --> r2: 0.5864\n\n\n>>> # Analyze the results\n>>> print(runner.results)\n\n            r2_train       r2_test  time_fit      time\nfrac model                                            \n0.2  OLS02    0.6391 -4.630209e+25  0.148360  0.148360\n0.4  OLS04    0.6137 -9.496102e+22  0.149996  0.149996\n0.6  OLS06    0.6086 -2.872000e-01  0.151353  0.151353\n0.8  OLS08    0.5986  5.025000e-01  0.149508  0.149508\n1.0  OLS10    0.5951  5.864000e-01  0.149549  0.149549\n\n\n>>> print(runner.evaluate())\n\n                mae          mape           mse            r2          rmse\nOLS02 -1.004380e+12 
-7.646687e+14 -3.774343e+26 -4.630209e+25 -1.942767e+13\nOLS04 -5.120843e+10 -8.663629e+14 -7.740805e+23 -9.496102e+22 -8.798184e+11\nOLS06 -1.559600e+00 -7.836450e+14 -1.049240e+01 -2.872000e-01 -3.239200e+00\nOLS08 -1.482200e+00 -8.382465e+14 -4.055100e+00  5.025000e-01 -2.013700e+00\nOLS10 -1.445900e+00 -8.224099e+14 -3.371700e+00  5.864000e-01 -1.836200e+00\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/trainsizingregressor/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/trainsizingregressor/#data-attributes", "title": "Data attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.

                                                                                                                                                                                                                                                                                                                                                                  Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set.

                                                                                                                                                                                                                                                                                                                                                                  This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s).

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/trainsizingregressor/#utility-attributes", "title": "Utility attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The utility attributes are used to access information about the models in the instance after training.

                                                                                                                                                                                                                                                                                                                                                                  Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance.

                                                                                                                                                                                                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. winner: model | NoneBest performing model.

                                                                                                                                                                                                                                                                                                                                                                  Performance is measured as the highest score on the model's [main_metric]_bootstrap or [main_metric]_test, checked in that order. Ties are resolved looking at the lowest time_fit. results: pd.DataFrameOverview of the training results.

                                                                                                                                                                                                                                                                                                                                                                  All durations are in seconds. Possible values include:

                                                                                                                                                                                                                                                                                                                                                                  • [metric]_ht: Score obtained by the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                                  • time_ht: Duration of the hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                                  • [metric]_train: Metric score on the train set.
                                                                                                                                                                                                                                                                                                                                                                  • [metric]_test: Metric score on the test set.
                                                                                                                                                                                                                                                                                                                                                                  • time_fit: Duration of the model fitting on the train set.
                                                                                                                                                                                                                                                                                                                                                                  • [metric]_bootstrap: Mean score on the bootstrapped samples.
                                                                                                                                                                                                                                                                                                                                                                  • time_bootstrap: Duration of the bootstrapping.
                                                                                                                                                                                                                                                                                                                                                                  • time: Total duration of the run.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/trainsizingregressor/#tracking-attributes", "title": "Tracking attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/trainsizingregressor/#plot-attributes", "title": "Plot attributes", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Attributespalette: str | Sequence[str]Color palette.

                                                                                                                                                                                                                                                                                                                                                                  Specify one of plotly's built-in palettes or create a custom one, e.g., atom.palette = [\"red\", \"green\", \"blue\"]. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "API/training/trainsizingregressor/#methods", "title": "Methods", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  method available_models()[source]Give an overview of the available predefined models.

                                                                                                                                                                                                                                                                                                                                                                  Returnspd.DataFrame Information about the available predefined models. Columns include:

                                                                                                                                                                                                                                                                                                                                                                  • acronym: Model's acronym (used to call the model).
                                                                                                                                                                                                                                                                                                                                                                  • model: Name of the model's class.
                                                                                                                                                                                                                                                                                                                                                                  • estimator: The model's underlying estimator.
                                                                                                                                                                                                                                                                                                                                                                  • module: The estimator's module.
                                                                                                                                                                                                                                                                                                                                                                  • needs_scaling: Whether the model requires feature scaling.
                                                                                                                                                                                                                                                                                                                                                                  • accepts_sparse: Whether the model accepts sparse matrices.
                                                                                                                                                                                                                                                                                                                                                                  • native_multilabel: Whether the model has native support for multilabel tasks.
                                                                                                                                                                                                                                                                                                                                                                  • native_multioutput: Whether the model has native support for multioutput tasks.
                                                                                                                                                                                                                                                                                                                                                                  • has_validation: Whether the model has in-training validation.
                                                                                                                                                                                                                                                                                                                                                                  • supports_engines: Engines supported by the model.

                                                                                                                                                                                                                                                                                                                                                                  method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

                                                                                                                                                                                                                                                                                                                                                                  This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

                                                                                                                                                                                                                                                                                                                                                                  Parametersrows: int, default=1 Number of plots in length.

                                                                                                                                                                                                                                                                                                                                                                  cols: int, default=2 Number of plots in width.

                                                                                                                                                                                                                                                                                                                                                                  horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                                  vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

                                                                                                                                                                                                                                                                                                                                                                  title: str, dict or None, default=None Title for the plot.

                                                                                                                                                                                                                                                                                                                                                                  • If None, no title is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str, text for the title.
                                                                                                                                                                                                                                                                                                                                                                  • If dict, title configuration.

                                                                                                                                                                                                                                                                                                                                                                  legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

                                                                                                                                                                                                                                                                                                                                                                  • If None: No legend is shown.
                                                                                                                                                                                                                                                                                                                                                                  • If str: Location where to show the legend.
                                                                                                                                                                                                                                                                                                                                                                  • If dict: Legend configuration.

                                                                                                                                                                                                                                                                                                                                                                  figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

                                                                                                                                                                                                                                                                                                                                                                  filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

                                                                                                                                                                                                                                                                                                                                                                  display: bool, default=True Whether to render the plot.

                                                                                                                                                                                                                                                                                                                                                                  Yieldsgo.Figure Plot object.

                                                                                                                                                                                                                                                                                                                                                                  method clear()[source]Reset attributes and clear cache from all models.

                                                                                                                                                                                                                                                                                                                                                                  Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

                                                                                                                                                                                                                                                                                                                                                                  • In-training validation scores
                                                                                                                                                                                                                                                                                                                                                                  • Shap values
                                                                                                                                                                                                                                                                                                                                                                  • App instance
                                                                                                                                                                                                                                                                                                                                                                  • Dashboard instance
                                                                                                                                                                                                                                                                                                                                                                  • Calculated holdout data sets

                                                                                                                                                                                                                                                                                                                                                                  method delete(models=None)[source]Delete models.

                                                                                                                                                                                                                                                                                                                                                                  If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted.

                                                                                                                                                                                                                                                                                                                                                                  method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task.

                                                                                                                                                                                                                                                                                                                                                                  rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on.

                                                                                                                                                                                                                                                                                                                                                                  threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

                                                                                                                                                                                                                                                                                                                                                                  • The task is binary or multilabel classification.
                                                                                                                                                                                                                                                                                                                                                                  • The model has a predict_proba method.
                                                                                                                                                                                                                                                                                                                                                                  • The metric evaluates predicted probabilities.

                                                                                                                                                                                                                                                                                                                                                                  For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.

                                                                                                                                                                                                                                                                                                                                                                  sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

                                                                                                                                                                                                                                                                                                                                                                  Returnspd.DataFrame Scores of the models.

                                                                                                                                                                                                                                                                                                                                                                  method export_pipeline(model=None)[source]Export the internal pipeline.

                                                                                                                                                                                                                                                                                                                                                                  This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported.

                                                                                                                                                                                                                                                                                                                                                                  ReturnsPipeline Current branch as a sklearn-like Pipeline object.

                                                                                                                                                                                                                                                                                                                                                                  method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.

                                                                                                                                                                                                                                                                                                                                                                  Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.

                                                                                                                                                                                                                                                                                                                                                                  Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights.

                                                                                                                                                                                                                                                                                                                                                                  Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  method get_params(deep=True)[source]Get parameters for this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

                                                                                                                                                                                                                                                                                                                                                                  Returnsparams : dict Parameter names mapped to their values.

                                                                                                                                                                                                                                                                                                                                                                  method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.

                                                                                                                                                                                                                                                                                                                                                                  Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.

                                                                                                                                                                                                                                                                                                                                                                  Parametersother: Runner Instance with which to merge. Should be of the same class as self.

                                                                                                                                                                                                                                                                                                                                                                  suffix: str, default=\"2\" Branches and models with conflicting names are merged adding suffix to the end of their names.

                                                                                                                                                                                                                                                                                                                                                                  method update_layout(**kwargs)[source]Update the properties of the plot's layout.

                                                                                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original layout with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_layout method.

                                                                                                                                                                                                                                                                                                                                                                  method update_traces(**kwargs)[source]Update the properties of the plot's traces.

                                                                                                                                                                                                                                                                                                                                                                  Recursively update the structure of the original traces with the values in the arguments.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**kwargs Keyword arguments for the figure's update_traces method.

                                                                                                                                                                                                                                                                                                                                                                  method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

                                                                                                                                                                                                                                                                                                                                                                  method run(*arrays)[source]Train and evaluate the models.

                                                                                                                                                                                                                                                                                                                                                                  Read more in the user guide.

                                                                                                                                                                                                                                                                                                                                                                  Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are:

                                                                                                                                                                                                                                                                                                                                                                  • train, test
                                                                                                                                                                                                                                                                                                                                                                  • X_train, X_test, y_train, y_test
                                                                                                                                                                                                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test)

                                                                                                                                                                                                                                                                                                                                                                  method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

                                                                                                                                                                                                                                                                                                                                                                  Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming.

                                                                                                                                                                                                                                                                                                                                                                  save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance.

                                                                                                                                                                                                                                                                                                                                                                  method set_params(**params)[source]Set the parameters of this estimator.

                                                                                                                                                                                                                                                                                                                                                                  Parameters**params : dict Estimator parameters.

                                                                                                                                                                                                                                                                                                                                                                  Returnsself : estimator instance Estimator instance.

                                                                                                                                                                                                                                                                                                                                                                  method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                                                                                                                                                                                                  name: str, default=\"Stack\" Name of the model. The name is always presided with the model's acronym: Stack.

                                                                                                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the final_estimator parameter.

                                                                                                                                                                                                                                                                                                                                                                  method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                                                                                                                                                                                                  Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

                                                                                                                                                                                                                                                                                                                                                                  name: str, default=\"Vote\" Name of the model. The name is always presided with the model's acronym: Vote.

                                                                                                                                                                                                                                                                                                                                                                  **kwargs Additional keyword arguments for sklearn's voting instance.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/", "title": "Release history", "text": ""}, {"location": "changelog/v4.x.x/#version-4141", "title": "Version 4.14.1", "text": "
                                                                                                                                                                                                                                                                                                                                                                  • Fixed an installation issue with conda.
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-4140", "title": "Version 4.14.0", "text": "
                                                                                                                                                                                                                                                                                                                                                                  • Refactor of the Cleaner and Vectorizer classes.
                                                                                                                                                                                                                                                                                                                                                                  • Refactor of the cross_validate method.
                                                                                                                                                                                                                                                                                                                                                                  • The plot_pipeline method now supports drawing multiple pipelines.
                                                                                                                                                                                                                                                                                                                                                                  • Renamed the Normalizer class to TextNormalizer.
                                                                                                                                                                                                                                                                                                                                                                  • Renamed the Gauss class to Normalizer.
                                                                                                                                                                                                                                                                                                                                                                  • Added the inverse_transform method to the Scaler, Normalizer and Cleaner classes.
                                                                                                                                                                                                                                                                                                                                                                  • Added the winners property to the trainers (note the extra s).
                                                                                                                                                                                                                                                                                                                                                                  • Added the feature_names_in_ and n_features_in_ attributes to transformers.
                                                                                                                                                                                                                                                                                                                                                                  • The default value of the warnings parameter is set to False.
                                                                                                                                                                                                                                                                                                                                                                  • Improvements for multicollinearity removal in FeatureSelector.
                                                                                                                                                                                                                                                                                                                                                                  • Renamed default feature names to x0, x1, etc... for consistency with sklearn's API.
                                                                                                                                                                                                                                                                                                                                                                  • Renamed component names in FeatureSelector to pca0, pca1, etc... for consistency with sklearn's API.
                                                                                                                                                                                                                                                                                                                                                                  • Significant speed up in pipeline transformations.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where mlflow runs could be ended unexpectedly.
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-4131", "title": "Version 4.13.1", "text": "
                                                                                                                                                                                                                                                                                                                                                                  • Fixed an installation issue.
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-4130", "title": "Version 4.13.0", "text": "
                                                                                                                                                                                                                                                                                                                                                                  • Added GPU support. Read more in the user guide.
                                                                                                                                                                                                                                                                                                                                                                  • Added advanced feature selection strategies.
                                                                                                                                                                                                                                                                                                                                                                  • Added the return_sparse parameter to the Vectorizer class.
                                                                                                                                                                                                                                                                                                                                                                  • Added the quantile hyperparameter to the Dummy model.
                                                                                                                                                                                                                                                                                                                                                                  • The data attributes now return pandas objects where possible.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where the BO could crash after balancing the data.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where saving the FeatureGenerator class could fail for certain operators.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where the FeatureSelector class displayed the wrong output.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where the mapping attribute was not reordered.
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-4120", "title": "Version 4.12.0", "text": "
                                                                                                                                                                                                                                                                                                                                                                  • Support for Python 3.10.
                                                                                                                                                                                                                                                                                                                                                                  • New Discretizer class to bin numerical features.
                                                                                                                                                                                                                                                                                                                                                                  • Refactor of the FeatureGenerator class.
                                                                                                                                                                                                                                                                                                                                                                  • The mapping attribute now shows all encoded features.
                                                                                                                                                                                                                                                                                                                                                                  • Added the sample_weight parameter to the evaluate method.
                                                                                                                                                                                                                                                                                                                                                                  • ATOMClassifier has now a stratify parameter to split the data sets in a stratified fashion.
                                                                                                                                                                                                                                                                                                                                                                  • Possibility to exclude hyperparameters from the BO adding ! before the name.
                                                                                                                                                                                                                                                                                                                                                                  • Added memory usage to the stats method.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where plot_shap_decision could fail when only one row was plotted.
                                                                                                                                                                                                                                                                                                                                                                  • Added versioning to the documentation.
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-4110", "title": "Version 4.11.0", "text": "
                                                                                                                                                                                                                                                                                                                                                                  • Full support for sparse matrices. Read more in the user guide.
                                                                                                                                                                                                                                                                                                                                                                  • The shrink method now also handles sparse features.
                                                                                                                                                                                                                                                                                                                                                                  • Refactor of the distribution method.
                                                                                                                                                                                                                                                                                                                                                                  • Added three new linear models: Lars, Huber and Perc.
                                                                                                                                                                                                                                                                                                                                                                  • Dimensions can be shared across models using the key 'all' in ht_params[\"dimensions\"].
                                                                                                                                                                                                                                                                                                                                                                  • Assign hyperparameters to tune using the predefined dimensions.
                                                                                                                                                                                                                                                                                                                                                                  • It's now possible to tune a custom number of layers for the MLP model.
                                                                                                                                                                                                                                                                                                                                                                  • If multiple BO calls share the best score, the one with the shortest training time is selected as winner (instead of the first).
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where the BO could fail when custom dimensions where defined.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where FeatureSelector could fail after repeated calls to fit.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where FeatureGenerator didn't pass the correct data indices to its output.
                                                                                                                                                                                                                                                                                                                                                                  • Performance improvements for the custom pipeline.
                                                                                                                                                                                                                                                                                                                                                                  • Minor documentation fixes.
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-4100", "title": "Version 4.10.0", "text": "
                                                                                                                                                                                                                                                                                                                                                                  • Added the holdout data set to have an extra way of assessing a model's performance on a completely independent dataset. Read more in the user_guide.
                                                                                                                                                                                                                                                                                                                                                                  • Complete rework of the ensemble models.
                                                                                                                                                                                                                                                                                                                                                                  • Support for dataframe indexing. Read more in the user guide.
                                                                                                                                                                                                                                                                                                                                                                  • New plot_parshap plot to detect overfitting features.
                                                                                                                                                                                                                                                                                                                                                                  • The new create_dashboard method makes analyzing the models even easier using a dashboard app.
                                                                                                                                                                                                                                                                                                                                                                  • The plot_feature_importance plot now also accepts estimators with coefficients.
                                                                                                                                                                                                                                                                                                                                                                  • Added the transform method for models.
                                                                                                                                                                                                                                                                                                                                                                  • Added the threshold parameter to the evaluate method.
                                                                                                                                                                                                                                                                                                                                                                  • The reset_predictions method is deprecated in favour of the new clear method.
                                                                                                                                                                                                                                                                                                                                                                  • Refactor of the model's full_train method.
                                                                                                                                                                                                                                                                                                                                                                  • The merge method is available for all trainers.
                                                                                                                                                                                                                                                                                                                                                                  • Improvements in the trainer's pipeline.
                                                                                                                                                                                                                                                                                                                                                                  • Training scores are now also saved to the mlflow run.
                                                                                                                                                                                                                                                                                                                                                                  • Trying to change the data in a branch after fitting a model with it now raises an exception.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where the columns of array inputs were not ordered correctly.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where branches did not correctly act case-insensitive.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where the export_pipeline method for models would not export the transformers in the correct branch.
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-491", "title": "Version 4.9.1", "text": "
                                                                                                                                                                                                                                                                                                                                                                  • Changed the default cross-validation for hyperparameter tuning from 5 to 1 to avoid errors with deep learning models.
                                                                                                                                                                                                                                                                                                                                                                  • Added clearer exception messages when a model's run failed.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where custom dimensions didn't show during hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                                  • Documentation improvements.
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-490", "title": "Version 4.9.0", "text": "
                                                                                                                                                                                                                                                                                                                                                                  • Drop support of Python 3.6.
                                                                                                                                                                                                                                                                                                                                                                  • Added the HistGBM model.
                                                                                                                                                                                                                                                                                                                                                                  • Improved print layout for hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                                  • The new available_models method returns an overview of the available predefined models.
                                                                                                                                                                                                                                                                                                                                                                  • The calibrate and cross_validate methods can no longer be accessed from the trainers.
                                                                                                                                                                                                                                                                                                                                                                  • The pipeline parameter for the prediction methods is deprecated.
                                                                                                                                                                                                                                                                                                                                                                  • Improved visualization of the plot_rfecv, plot_successive_halving and plot_learning_curve methods.
                                                                                                                                                                                                                                                                                                                                                                  • Sparse matrices are now accepted as input.
                                                                                                                                                                                                                                                                                                                                                                  • Duplicate BO calls are no longer calculated.
                                                                                                                                                                                                                                                                                                                                                                  • Improvement in performance of the RNN model.
                                                                                                                                                                                                                                                                                                                                                                  • Refactor of the model's bo attribute.
                                                                                                                                                                                                                                                                                                                                                                  • Predefined hyperparameters have been updated to be consistent with sklearn's API.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where custom scalers were ignored by the models.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where the BO of certain models would crash with custom hyperparameters.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where duplicate column names could be generated from a custom transformer.
                                                                                                                                                                                                                                                                                                                                                                  • Documentation improvements.
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-480", "title": "Version 4.8.0", "text": "
                                                                                                                                                                                                                                                                                                                                                                  • The Encoder class now directly handles unknown categories encountered during fitting.
                                                                                                                                                                                                                                                                                                                                                                  • The Balancerand Encoder classes now accept custom estimators for the strategy parameter.
                                                                                                                                                                                                                                                                                                                                                                  • The new merge method enables the user to merge multiple atom instances into one.
                                                                                                                                                                                                                                                                                                                                                                  • The dtype shrinking is moved from atom's initializers to the shrink method.
                                                                                                                                                                                                                                                                                                                                                                  • ATOM's custom pipeline now handles transformers fitted on a subset of the dataset.
                                                                                                                                                                                                                                                                                                                                                                  • The column parameter in the distribution method is renamed to columns for continuity of the API.
                                                                                                                                                                                                                                                                                                                                                                  • The mae criterion for the GBM model hyperparameter tuning is deprecated to be consistent with sklearn's API.
                                                                                                                                                                                                                                                                                                                                                                  • Branches are now case-insensitive.
                                                                                                                                                                                                                                                                                                                                                                  • Renaming a branch using an existing name now raises an exception.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where columns of type category broke the Imputer class.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where predictions of the Stacking ensemble crashed for branches with multiple transformers.
                                                                                                                                                                                                                                                                                                                                                                  • The tables in the documentation now adapt to dark mode.
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-473", "title": "Version 4.7.3", "text": "
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where the conda-forge recipe couldn't install properly.
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-472", "title": "Version 4.7.2", "text": "
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where the pipeline failed for custom transformers that returned sparse matrices.
                                                                                                                                                                                                                                                                                                                                                                  • Package requirements files are added to the installer.
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-471", "title": "Version 4.7.1", "text": "
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where the pip installer failed.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where categorical columns also selected datetime columns.
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-470", "title": "Version 4.7.0", "text": "
                                                                                                                                                                                                                                                                                                                                                                  • Launched our new slack channel!
                                                                                                                                                                                                                                                                                                                                                                  • The new FeatureExtractor class extracts useful features from datetime columns.
                                                                                                                                                                                                                                                                                                                                                                  • The new plot_det method plots a binary classifier's detection error tradeoff curve.
                                                                                                                                                                                                                                                                                                                                                                  • The plot_partial_dependence is able to draw Individual Conditional Expectation (ICE) lines.
                                                                                                                                                                                                                                                                                                                                                                  • The full traceback of exceptions encountered during training are now saved to the logger.
                                                                                                                                                                                                                                                                                                                                                                  • ATOMClassifier and ATOMRegressor now convert the dtypes of the input data to the minimal allowed type for memory efficiency.
                                                                                                                                                                                                                                                                                                                                                                  • The scoring method is renamed to evaluate to clarify its purpose.
                                                                                                                                                                                                                                                                                                                                                                  • The column parameter in the apply method is renamed to columns for continuity of the API.
                                                                                                                                                                                                                                                                                                                                                                  • Minor documentation improvements.
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-460", "title": "Version 4.6.0", "text": "
                                                                                                                                                                                                                                                                                                                                                                  • Added the full_train method to retrieve an estimator trained on the complete dataset.
                                                                                                                                                                                                                                                                                                                                                                  • The score method is now also able to calculate custom metrics on new data.
                                                                                                                                                                                                                                                                                                                                                                  • Refactor of the Imputer class.
                                                                                                                                                                                                                                                                                                                                                                  • Refactor of the Encoder class to avoid errors for unknown classes and allow the input of missing values.
                                                                                                                                                                                                                                                                                                                                                                  • The clean method no longer automatically encodes the target column for regression tasks.
                                                                                                                                                                                                                                                                                                                                                                  • Creating a branch using a models' acronym as name now raises an exception.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where CatBoost failed when early_stopping < 1.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where created pipelines had duplicated names.
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-450", "title": "Version 4.5.0", "text": "
                                                                                                                                                                                                                                                                                                                                                                  • Support of NLP pipelines. Read more in the user guide.
                                                                                                                                                                                                                                                                                                                                                                  • Integration of mlflow to track all models in the pipeline. Read more in the user guide.
                                                                                                                                                                                                                                                                                                                                                                  • The new Normalizer class transforms features to a more Gaussian-like distribution.
                                                                                                                                                                                                                                                                                                                                                                  • New cross_validate method to evaluate the robustness of a pipeline using cross_validation.
                                                                                                                                                                                                                                                                                                                                                                  • New reset method to go back to atom's initial state.
                                                                                                                                                                                                                                                                                                                                                                  • Added the Dummy model to compare other models with a simple baseline.
                                                                                                                                                                                                                                                                                                                                                                  • New plot_wordcloud and plot_ngrams methods for text visualization.
                                                                                                                                                                                                                                                                                                                                                                  • Plots now can return the figure object when display=None.
                                                                                                                                                                                                                                                                                                                                                                  • The Pruner class can now able to drop outliers based on the selection of multiple strategies.
                                                                                                                                                                                                                                                                                                                                                                  • The new shuffle parameter in atom's initializer determines whether to shuffle the dataset.
                                                                                                                                                                                                                                                                                                                                                                  • The trainers no longer require you to specify a model using the models parameter. If left to default, all predefined models for that task are used.
                                                                                                                                                                                                                                                                                                                                                                  • The apply method now accepts args and kwargs for the function.
                                                                                                                                                                                                                                                                                                                                                                  • Refactor of the evaluate method.
                                                                                                                                                                                                                                                                                                                                                                  • Refactor of the export_pipeline method.
                                                                                                                                                                                                                                                                                                                                                                  • The parameters in the Cleaner class have been refactored to better describe their function.
                                                                                                                                                                                                                                                                                                                                                                  • The train_sizes parameter in train_sizing now accepts integer values to automatically create equally distributed splits in the training set.
                                                                                                                                                                                                                                                                                                                                                                  • Refactor of plot_pipeline to show models in the diagram as well.
                                                                                                                                                                                                                                                                                                                                                                  • Refactor of the bagging parameter to the (more appropriate) name n_bootstrap.
                                                                                                                                                                                                                                                                                                                                                                  • New option to exclude columns from a transformer adding ! before their name.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where the Pruner class failed if there were categorical columns in the dataset.
                                                                                                                                                                                                                                                                                                                                                                  • Completely reworked documentation website.
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-440", "title": "Version 4.4.0", "text": "
                                                                                                                                                                                                                                                                                                                                                                  • New apply method to perform data transformations as function to the pipeline
                                                                                                                                                                                                                                                                                                                                                                  • Added the status method to save an overview of atom's branches and models to the logger.
                                                                                                                                                                                                                                                                                                                                                                  • Improved the output messages for the Imputer class.
                                                                                                                                                                                                                                                                                                                                                                  • The dataset's columns can now be called directly from atom.
                                                                                                                                                                                                                                                                                                                                                                  • The distribution and plot_distribution methods now ignore missing values.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where transformations could fail when columns were added to the dataset after initializing the pipeline.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where the Cleaner class didn't drop columns consisting entirely of missing values when drop_min_cardinality=True.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where the winning model wasn't displayed correctly.
                                                                                                                                                                                                                                                                                                                                                                  • Refactored the way transformers are added or removed from predicting methods.
                                                                                                                                                                                                                                                                                                                                                                  • Improved documentation.
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-430", "title": "Version 4.3.0", "text": "
                                                                                                                                                                                                                                                                                                                                                                  • Possibility to add custom transformers to the pipeline.
                                                                                                                                                                                                                                                                                                                                                                  • The export_pipeline utility method exports atom's current pipeline to a sklearn object.
                                                                                                                                                                                                                                                                                                                                                                  • New magic methods makes atom behave similarly to sklearn's Pipeline.
                                                                                                                                                                                                                                                                                                                                                                  • All training approaches can now be combined in the same atom instance.
                                                                                                                                                                                                                                                                                                                                                                  • New plot_relationships, plot_distribution and plot_qq plots for data inspection.
                                                                                                                                                                                                                                                                                                                                                                  • Complete rework of all the shap plots to be consistent with their new API.
                                                                                                                                                                                                                                                                                                                                                                  • Improvements for the Scaler and [Pruner]([] classes.
                                                                                                                                                                                                                                                                                                                                                                  • The acronym for custom models now defaults to the capital letters in the class' __name__.
                                                                                                                                                                                                                                                                                                                                                                  • Possibility to apply transformations on only a subset of the columns.
                                                                                                                                                                                                                                                                                                                                                                  • Plots and methods now accept winner as model name.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where custom metrics didn't show the correct name.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where timers were not displayed correctly.
                                                                                                                                                                                                                                                                                                                                                                  • Further compatibility with deep learning datasets.
                                                                                                                                                                                                                                                                                                                                                                  • Large refactoring for performance optimization.
                                                                                                                                                                                                                                                                                                                                                                  • Cleaner output of messages to the logger.
                                                                                                                                                                                                                                                                                                                                                                  • Plots no longer show a default title.
                                                                                                                                                                                                                                                                                                                                                                  • Minor bug fixes.
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-421", "title": "Version 4.2.1", "text": "
                                                                                                                                                                                                                                                                                                                                                                  • Bug fix where there was memory leakage in successive halving and train sizing pipelines.
                                                                                                                                                                                                                                                                                                                                                                  • Improved documentation.
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-420", "title": "Version 4.2.0", "text": "
                                                                                                                                                                                                                                                                                                                                                                  • Possibility to add custom models to the pipeline using ATOMModel.
                                                                                                                                                                                                                                                                                                                                                                  • Compatibility with deep learning models.
                                                                                                                                                                                                                                                                                                                                                                  • New branch system for different data pipelines. Read more in the user guide.
                                                                                                                                                                                                                                                                                                                                                                  • Use the canvas contextmanager to draw multiple plots in one figure.
                                                                                                                                                                                                                                                                                                                                                                  • New voting and stacking ensemble techniques.
                                                                                                                                                                                                                                                                                                                                                                  • New get_class_weight utility method.
                                                                                                                                                                                                                                                                                                                                                                  • New Sequential Feature Selection strategy for the FeatureSelector.
                                                                                                                                                                                                                                                                                                                                                                  • Added the sample_weight parameter to the score method.
                                                                                                                                                                                                                                                                                                                                                                  • New ways to initialize the data in the training instances.
                                                                                                                                                                                                                                                                                                                                                                  • The test_size parameter now also allows integer values.
                                                                                                                                                                                                                                                                                                                                                                  • Renamed categories to classes to be consistent with sklearn's API.
                                                                                                                                                                                                                                                                                                                                                                  • The class property now returns a pd.DataFrame of the number of rows per target class in the train, test and complete dataset.
                                                                                                                                                                                                                                                                                                                                                                  • Possibility to add custom parameters to an estimator's fit method through est_params.
                                                                                                                                                                                                                                                                                                                                                                  • The successive halving and train sizing approaches now both allow subsequent runs from atom without losing the information from previous runs.
                                                                                                                                                                                                                                                                                                                                                                  • Bug fix where ATOMLoader wouldn't encode the target column during transformation.
                                                                                                                                                                                                                                                                                                                                                                  • Added the Deep learning, Ensembles and Utilities example notebooks.
                                                                                                                                                                                                                                                                                                                                                                  • Support for python 3.9.
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-410", "title": "Version 4.1.0", "text": "
                                                                                                                                                                                                                                                                                                                                                                  • New est_params parameter to customize the parameters in every model's estimator.
                                                                                                                                                                                                                                                                                                                                                                  • Following skopt's API, the n_random_starts parameter to specify the number of random trials is deprecated in favour of n_initial_points.
                                                                                                                                                                                                                                                                                                                                                                  • The Balancer class now allows you to use any of the strategies from imblearn.
                                                                                                                                                                                                                                                                                                                                                                  • New utility attributes to inspect the dataset.
                                                                                                                                                                                                                                                                                                                                                                  • Four new models: CatNB, CNB, ARD and RNN.
                                                                                                                                                                                                                                                                                                                                                                  • Added the models section to the documentation.
                                                                                                                                                                                                                                                                                                                                                                  • Small changes in log outputs.
                                                                                                                                                                                                                                                                                                                                                                  • Bug fixes and performance improvements.
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-401", "title": "Version 4.0.1", "text": "
                                                                                                                                                                                                                                                                                                                                                                  • Bug fix where the FeatureGenerator was not deterministic for a fixed random state.
                                                                                                                                                                                                                                                                                                                                                                  • Bug fix where subsequent runs with the same metric failed.
                                                                                                                                                                                                                                                                                                                                                                  • Added the license file to the package's installer.
                                                                                                                                                                                                                                                                                                                                                                  • Typo fixes in documentation.
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v4.x.x/#version-400", "title": "Version 4.0.0", "text": "
                                                                                                                                                                                                                                                                                                                                                                  • Bayesian optimization package changed from GpyOpt to skopt.
                                                                                                                                                                                                                                                                                                                                                                  • Complete revision of the model's hyperparameters.
                                                                                                                                                                                                                                                                                                                                                                  • Four SHAP plots can now be called directly from an ATOM pipeline.
                                                                                                                                                                                                                                                                                                                                                                  • Two new plots for regression tasks.
                                                                                                                                                                                                                                                                                                                                                                  • New plot_pipeline and pipeline attribute to access all transformers.
                                                                                                                                                                                                                                                                                                                                                                  • Possibility to determine transformer parameters per method.
                                                                                                                                                                                                                                                                                                                                                                  • New calibrate and plot_calibration methods.
                                                                                                                                                                                                                                                                                                                                                                  • Metrics can now be added as scorers or functions with signature metric(y, y_pred, **kwargs).
                                                                                                                                                                                                                                                                                                                                                                  • Implementation of multi-metric runs.
                                                                                                                                                                                                                                                                                                                                                                  • Possibility to choose which metric to plot.
                                                                                                                                                                                                                                                                                                                                                                  • Early stopping for models that allow in-training validation.
                                                                                                                                                                                                                                                                                                                                                                  • Added the ATOMLoader function to load any saved pickle instance.
                                                                                                                                                                                                                                                                                                                                                                  • The \"remove\" strategy in the data cleaning parameters is deprecated in favour of \"drop\".
                                                                                                                                                                                                                                                                                                                                                                  • Implemented the dfs strategy in FeatureGenerator.
                                                                                                                                                                                                                                                                                                                                                                  • All training classes now inherit from BaseEstimator.
                                                                                                                                                                                                                                                                                                                                                                  • Added multiple new example notebooks.
                                                                                                                                                                                                                                                                                                                                                                  • Tests coverage up to 100%.
                                                                                                                                                                                                                                                                                                                                                                  • Completely new documentation page.
                                                                                                                                                                                                                                                                                                                                                                  • Bug fixes and performance improvements.
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v5.x.x/", "title": "Release history", "text": ""}, {"location": "changelog/v5.x.x/#version-600", "title": "Version 6.0.0", "text": "

                                                                                                                                                                                                                                                                                                                                                                  New features

                                                                                                                                                                                                                                                                                                                                                                  • Completely new module for time series. Read more in the user guide.
                                                                                                                                                                                                                                                                                                                                                                  • Support for Python 3.11 and drop support for Python 3.8 and Python 3.9.
                                                                                                                                                                                                                                                                                                                                                                  • New data engines. Read more in the user guide.
                                                                                                                                                                                                                                                                                                                                                                  • Improved memory optimizations. Read more in the user guide.
                                                                                                                                                                                                                                                                                                                                                                  • Added the iterative strategy for numerical imputation.
                                                                                                                                                                                                                                                                                                                                                                  • New update_traces method to further customize your plots.

                                                                                                                                                                                                                                                                                                                                                                  API changes

                                                                                                                                                                                                                                                                                                                                                                  • The FeatureGrouper class no longer accepts a name parameter. Provide the group names directly through the group parameter as dict.
                                                                                                                                                                                                                                                                                                                                                                  • Rework of the register method.
                                                                                                                                                                                                                                                                                                                                                                  • The multioutput attribute is deprecated. Multioutput meta-estimators are now assigned automatically.
                                                                                                                                                                                                                                                                                                                                                                  • Model tags have to be separated from the acronym by an underscore.
                                                                                                                                                                                                                                                                                                                                                                  • The engine parameter is now a dict.
                                                                                                                                                                                                                                                                                                                                                                  • The automl method is deprecated.

                                                                                                                                                                                                                                                                                                                                                                  Enhancements

                                                                                                                                                                                                                                                                                                                                                                  • Transformations only on y are now accepted, e.g., atom.scale(columns=-1).
                                                                                                                                                                                                                                                                                                                                                                  • Full support for pandas nullable dtypes.
                                                                                                                                                                                                                                                                                                                                                                  • The dataset can now be provided as callable.
                                                                                                                                                                                                                                                                                                                                                                  • The save and save_data methods now accept pathlib.Path objects as filename.
                                                                                                                                                                                                                                                                                                                                                                  • Cleaner representation on hover for the plot_timeline method.
                                                                                                                                                                                                                                                                                                                                                                  • Added the hdbscan strategy to the Pruner class.
                                                                                                                                                                                                                                                                                                                                                                  • The cv key in ht_params now accepts a custom cross-validation generator.
                                                                                                                                                                                                                                                                                                                                                                  • Improved error message for incorrect stratification of multioutput datasets.
                                                                                                                                                                                                                                                                                                                                                                  • Rework of the shrink method.

                                                                                                                                                                                                                                                                                                                                                                  Bug fixes

                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where the cross_validate method could fail for pipelines that changed the number of rows.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where the Pruner class didn't drop all outlier clusters.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where the pipeline could fail for transformers that returned a series.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where the pipeline could fail for transformers that reset its internal attributes during fitting.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where the register method failed in Databricks.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where tuning hyperparameter for a base_estimator inside a custom meta-estimator would fail.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where the data properties' @setter could fail for numpy arrays.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v5.x.x/#version-520", "title": "Version 5.2.0", "text": "

                                                                                                                                                                                                                                                                                                                                                                  New features

                                                                                                                                                                                                                                                                                                                                                                  • Two new plot methods: plot_terminator_improvement and plot_timeline.

                                                                                                                                                                                                                                                                                                                                                                  Enhancements

                                                                                                                                                                                                                                                                                                                                                                  • Data splits in every trial are now properly stratified according to the selected strategy.
                                                                                                                                                                                                                                                                                                                                                                  • Performance optimization for multiple methods using smart caching.
                                                                                                                                                                                                                                                                                                                                                                  • Improved visualizations for plots with logarithmic hyperparameters.

                                                                                                                                                                                                                                                                                                                                                                  Bug fixes

                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where parameters in a trial would not match with those displayed.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v5.x.x/#version-512", "title": "Version 5.1.2", "text": "

                                                                                                                                                                                                                                                                                                                                                                  API changes

                                                                                                                                                                                                                                                                                                                                                                  • The default strategy for the encode method has changed from \"LeaveOneOut\" to \"Target\"-encoding. LeaveOneOut is no longer a supported strategy.

                                                                                                                                                                                                                                                                                                                                                                  Bug fixes

                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where stratification failed for datasets where the target column was not placed last.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where transformers with no get_feature_names_out method could fail.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where the FeatureSelector class could fail when transforming a dataset with different column order than seen at fit time.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v5.x.x/#version-511", "title": "Version 5.1.1", "text": "

                                                                                                                                                                                                                                                                                                                                                                  API changes

                                                                                                                                                                                                                                                                                                                                                                  • The infrequent_to_value parameter in the Encoder class is replaced with infrequent_to_value to be consistent with sklearn's naming convention.

                                                                                                                                                                                                                                                                                                                                                                  Enhancements

                                                                                                                                                                                                                                                                                                                                                                  • Added the kwargs parameter to the save_data method.

                                                                                                                                                                                                                                                                                                                                                                  Bug fixes

                                                                                                                                                                                                                                                                                                                                                                  • Fixed an installation issue for systems without an x86 architecture.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where Voting would fail for certain metrics.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where the time metric in mlflow was always zero.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where shap plots wouldn't display the full column names.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where column names where not properly propagated during transformation.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v5.x.x/#version-510", "title": "Version 5.1.0", "text": "

                                                                                                                                                                                                                                                                                                                                                                  New features

                                                                                                                                                                                                                                                                                                                                                                  • Support for multilabel classification, multiclass-multilabel classification and multioutput regression tasks. Read more in the user guide.
                                                                                                                                                                                                                                                                                                                                                                  • New backend parameter to choose a parallel execution backend.
                                                                                                                                                                                                                                                                                                                                                                  • New parallel parameter to train multiple models simultaneously.
                                                                                                                                                                                                                                                                                                                                                                  • Integration with DAGsHub to store your mlflow experiments. Read more in the user guide.
                                                                                                                                                                                                                                                                                                                                                                  • New serve method to deploy models to a rest API endpoint.
                                                                                                                                                                                                                                                                                                                                                                  • New get_best_threshold method to calculate the optimal threshold for binary and multilabel tasks.
                                                                                                                                                                                                                                                                                                                                                                  • New get_sample_weight method to calculate the sample weights for a balanced data set.

                                                                                                                                                                                                                                                                                                                                                                  API changes

                                                                                                                                                                                                                                                                                                                                                                  • The ATOMLoader class is deprecated in favor of the load method.
                                                                                                                                                                                                                                                                                                                                                                  • The errors attribute for runners is deprecated.

                                                                                                                                                                                                                                                                                                                                                                  Enhancements

                                                                                                                                                                                                                                                                                                                                                                  • Added three new notebook examples.
                                                                                                                                                                                                                                                                                                                                                                  • Added the drop_chars parameter to the Cleaner class.
                                                                                                                                                                                                                                                                                                                                                                  • Added the errors parameter to the trainers.
                                                                                                                                                                                                                                                                                                                                                                  • Rework of the dependencies, making the base package more lightweight.
                                                                                                                                                                                                                                                                                                                                                                  • The logging entries for external libraries are redirected to atom's file handler.

                                                                                                                                                                                                                                                                                                                                                                  Bug fixes

                                                                                                                                                                                                                                                                                                                                                                  • Fixed multiple errors that appeared after sklearn's 1.2 update.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where hyperparameter tuning could fail for multi-metric runs.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where trials would try to report multiple times the same step.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where custom models could skip in-training validation.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed an issue where the bootstrapping estimators were trained using partial_fit.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v5.x.x/#version-501", "title": "Version 5.0.1", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Bug fixes

                                                                                                                                                                                                                                                                                                                                                                  • Fixed installation issue.
                                                                                                                                                                                                                                                                                                                                                                  • Updated package dependencies.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "changelog/v5.x.x/#version-500", "title": "Version 5.0.0", "text": "

                                                                                                                                                                                                                                                                                                                                                                  New features

                                                                                                                                                                                                                                                                                                                                                                  • Completely new hyperparameter tuning process.
                                                                                                                                                                                                                                                                                                                                                                  • Completely reworked plotting interface.
                                                                                                                                                                                                                                                                                                                                                                  • Accelerate your pipelines with sklearnex.
                                                                                                                                                                                                                                                                                                                                                                  • New FeatureGrouper class to extract statistical features from similar groups.
                                                                                                                                                                                                                                                                                                                                                                  • New create_app method to create a nice front-end for model predictions.
                                                                                                                                                                                                                                                                                                                                                                  • New inverse_transform method for atom and models.
                                                                                                                                                                                                                                                                                                                                                                  • New linear model: OrthogonalMatchingPursuit.
                                                                                                                                                                                                                                                                                                                                                                  • The plot_results method now accepts time metrics.

                                                                                                                                                                                                                                                                                                                                                                  API changes

                                                                                                                                                                                                                                                                                                                                                                  • The gpu parameter is deprecated in favor of device and engine.
                                                                                                                                                                                                                                                                                                                                                                  • Refactor of the Cleaner, Discretizer, Encoder and FeatureSelector classes.
                                                                                                                                                                                                                                                                                                                                                                  • Refactor of all shap plots.
                                                                                                                                                                                                                                                                                                                                                                  • Refactor of the apply method.
                                                                                                                                                                                                                                                                                                                                                                  • The plot_scatter_matrix method is renamed to plot_relationships.
                                                                                                                                                                                                                                                                                                                                                                  • The kSVM model is renamed to SVM.
                                                                                                                                                                                                                                                                                                                                                                  • Multidimensional datasets are no longer supported. Check the deep learning section of the user guide for guidance with such datasets.
                                                                                                                                                                                                                                                                                                                                                                  • The greater_is_better, needs_proba and needs_threshold parameters are deprecated. Metric functions are now created using make_scorer's default parameters.
                                                                                                                                                                                                                                                                                                                                                                  • The drop method is removed from atom. Use the reworked apply method instead.
                                                                                                                                                                                                                                                                                                                                                                  • The prediction methods can no longer be called from atom.
                                                                                                                                                                                                                                                                                                                                                                  • The dashboard method for models is now called create_dashboard.

                                                                                                                                                                                                                                                                                                                                                                  Enhancements

                                                                                                                                                                                                                                                                                                                                                                  • New examples for plotting, automated feature scaling, pruning and advanced hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                                  • The Normalizer class can now be accelerated with GPU.
                                                                                                                                                                                                                                                                                                                                                                  • The Scaler class now ignores binary columns (only 0s and 1s).
                                                                                                                                                                                                                                                                                                                                                                  • The models parameter in plot and utility methods now accepts model indices.
                                                                                                                                                                                                                                                                                                                                                                  • The transform method now also transforms only y when X has a default value.
                                                                                                                                                                                                                                                                                                                                                                  • The prediction methods now return pandas objects.
                                                                                                                                                                                                                                                                                                                                                                  • Dependency versions are checked with originals after unpickling.
                                                                                                                                                                                                                                                                                                                                                                  • Automatic generation of documentation from docstrings.
                                                                                                                                                                                                                                                                                                                                                                  • Improvements in documentation display for mobile phones.
                                                                                                                                                                                                                                                                                                                                                                  • New feature_importance attribute for models.
                                                                                                                                                                                                                                                                                                                                                                  • Added a visualization for automated feature scaling to plot_pipeline.

                                                                                                                                                                                                                                                                                                                                                                  Bug fixes

                                                                                                                                                                                                                                                                                                                                                                  • The FeatureExtractor class no longer raises a warning for highly fragmented dataframes.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where models could not call the score function.
                                                                                                                                                                                                                                                                                                                                                                  • The Encoder class no longer fails when the user provides ordinal values that are not present during fitting.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug with the max_nan_rows parameter in the Imputer class.
                                                                                                                                                                                                                                                                                                                                                                  • Fixed a bug where Tokenizer could fail when no ngrams were found.
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/accelerating_cuml/", "title": "Accelerating cuml", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  from atom import ATOMClassifier\nfrom sklearn.datasets import make_classification\n\n# Create a dummy dataset\nX, y = make_classification(n_samples=100000, n_features=40)\n
                                                                                                                                                                                                                                                                                                                                                                  from atom import ATOMClassifier from sklearn.datasets import make_classification # Create a dummy dataset X, y = make_classification(n_samples=100000, n_features=40) In\u00a0[2]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMClassifier(X, y, device=\"gpu\", engine=\"cuml\", verbose=2)\n
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMClassifier(X, y, device=\"gpu\", engine=\"cuml\", verbose=2)
                                                                                                                                                                                                                                                                                                                                                                  << ================== ATOM ================== >>\nAlgorithm task: binary classification.\nGPU training enabled.\nBackend engine: cuml.\n\nDataset stats ==================== >>\nShape: (100000, 41)\nMemory: 32.80 MB\nScaled: True\nOutlier values: 8127 (0.2%)\n-------------------------------------\nTrain set size: 80000\nTest set size: 20000\n-------------------------------------\n|   |       dataset |         train |          test |\n| - | ------------- | ------------- | ------------- |\n| 0 |   50006 (1.0) |   40005 (1.0) |   10001 (1.0) |\n| 1 |   49994 (1.0) |   39995 (1.0) |    9999 (1.0) |\n\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[3]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.scale()\n
                                                                                                                                                                                                                                                                                                                                                                  atom.scale()
                                                                                                                                                                                                                                                                                                                                                                  Fitting Scaler...\nScaling features...\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[13]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.dataset\n
                                                                                                                                                                                                                                                                                                                                                                  atom.dataset Out[13]: x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 ... x31 x32 x33 x34 x35 x36 x37 x38 x39 target 0 2.021646 -0.634557 -0.867811 1.103642 1.559011 0.122284 -0.864821 1.411657 0.147997 -2.269082 ... -0.489864 1.861048 -0.353861 0.720823 -1.522117 -0.737707 -1.573936 -0.832174 0.203154 0 1 -0.019885 0.846568 -0.364059 -1.091604 -1.336692 0.186689 -0.274142 0.020563 0.693235 -1.908658 ... -1.610058 -0.365231 0.284908 0.170156 -0.236553 -0.573761 -0.107317 -2.480178 0.420341 0 2 0.516618 -0.013420 -0.753879 -0.488243 0.560051 0.395817 -0.522523 -1.083503 -0.073398 0.383061 ... 0.966283 1.405546 -0.658654 0.339090 -1.615997 -1.312444 0.984578 0.602858 -1.110684 1 3 0.111861 -0.966334 0.208509 0.494328 -0.766835 -0.003399 -0.500449 -0.530622 -0.481663 -1.146132 ... -0.304896 2.030211 -1.189488 -1.238600 1.658765 -0.255644 0.572194 0.195496 0.617734 1 4 0.160135 -0.873517 0.719142 -2.020767 0.421435 -1.941230 0.835615 -1.178845 0.235273 -0.328574 ... 1.633662 -0.631118 1.814046 1.031754 0.328665 1.704483 2.153710 -1.430552 -0.543915 1 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 99995 1.100240 0.092581 -0.346265 0.234024 0.590199 0.755019 -1.688456 -1.031070 -0.620193 -0.283336 ... 0.356480 1.346821 -0.299087 2.343587 -2.003646 -0.933179 0.764255 -0.233526 -1.462311 1 99996 -1.142596 0.321843 -0.974006 0.390418 0.404722 -0.324256 -0.288176 1.009458 0.860912 -0.191313 ... 0.044618 -2.030135 1.448640 -0.854798 1.441451 1.347461 -0.937607 0.572504 -0.787673 0 99997 1.658252 0.303637 -0.020324 0.225917 0.154092 -1.208507 -0.199919 1.063016 -0.395696 -0.060886 ... 
1.563345 -1.261853 -0.810122 -0.503823 1.565602 -1.264792 -0.591644 1.588397 0.601721 0 99998 -0.288042 -1.139792 1.548338 0.501413 0.361604 -0.315720 -0.564607 1.500870 0.501768 0.649079 ... 0.344663 1.734476 0.660177 0.767554 1.461940 0.310189 -1.469978 0.900132 1.114330 0 99999 -3.093351 -0.636463 -0.449575 1.169980 -1.041870 -0.257173 2.072777 -0.101111 -0.956916 -0.251162 ... 2.250647 0.746250 -0.610311 0.445467 -0.636288 -0.187444 0.226108 -0.186927 -1.024960 1

                                                                                                                                                                                                                                                                                                                                                                  100000 rows \u00d7 41 columns

                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  print(f\"Scaler used: {atom.standard}\")\nprint(f\"Scaler's module: {atom.standard.__class__.__module__}\")\n
                                                                                                                                                                                                                                                                                                                                                                  print(f\"Scaler used: {atom.standard}\") print(f\"Scaler's module: {atom.standard.__class__.__module__}\")
                                                                                                                                                                                                                                                                                                                                                                  Scaler used: StandardScaler()\nScaler's module: cuml._thirdparty.sklearn.preprocessing._data\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.run(models=[\"RF\", \"SGD\", \"XGB\"])\n
                                                                                                                                                                                                                                                                                                                                                                  atom.run(models=[\"RF\", \"SGD\", \"XGB\"])
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: RF, SGD, XGB\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9726\nTest evaluation --> f1: 0.9431\nTime elapsed: 1.935s\n-------------------------------------------------\nTotal time: 1.935s\n\n\nResults for StochasticGradientDescent:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9236\nTest evaluation --> f1: 0.9219\nTime elapsed: 02m:16s\n-------------------------------------------------\nTotal time: 02m:16s\n\n\nResults for XGBoost:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9749\nTest evaluation --> f1: 0.9437\nTime elapsed: 6.394s\n-------------------------------------------------\nTotal time: 6.394s\n\n\nFinal results ==================== >>\nTotal time: 02m:24s\n-------------------------------------\nRandomForest              --> f1: 0.9431\nStochasticGradientDescent --> f1: 0.9219\nXGBoost                   --> f1: 0.9437 !\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.results\n
                                                                                                                                                                                                                                                                                                                                                                  atom.results Out[6]: score_train score_test time_fit time RF 0.9726 0.9431 1.934512 1.934512 SGD 0.9236 0.9219 135.871493 135.871493 XGB 0.9749 0.9437 6.394416 6.394416 In\u00a0[7]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  for m in atom.models:\n    print(f\"{m}'s module: {atom[m].estimator.__class__.__module__}\")\n
                                                                                                                                                                                                                                                                                                                                                                  for m in atom.models: print(f\"{m}'s module: {atom[m].estimator.__class__.__module__}\")
                                                                                                                                                                                                                                                                                                                                                                  RF's module: cuml.ensemble.randomforestclassifier\nSGD's module: sklearn.linear_model._stochastic_gradient\nXGB's module: xgboost.sklearn\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[8]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.evaluate()\n
                                                                                                                                                                                                                                                                                                                                                                  atom.evaluate() Out[8]: accuracy average_precision balanced_accuracy f1 jaccard matthews_corrcoef precision recall roc_auc RF 0.9429 0.9741 0.9429 0.9431 0.8924 0.8858 0.9391 0.9472 0.9792 SGD 0.9217 0.9635 0.9218 0.9219 0.8551 0.8435 0.9203 0.9235 0.9676 XGB 0.9434 0.9753 0.9434 0.9437 0.8933 0.8868 0.9385 0.9489 0.9798"}, {"location": "examples/accelerating_cuml/#example-accelerating-pipelines-on-gpu", "title": "Example: Accelerating pipelines on GPU\u00b6", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This example shows how to accelerate a pipeline on GPU using cuML.

                                                                                                                                                                                                                                                                                                                                                                  The data used is a synthetic dataset created using sklearn's make_classification function.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/accelerating_sklearnex/", "title": "Accelerating sklearnex", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                                                                                                                                                                                                  # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n
                                                                                                                                                                                                                                                                                                                                                                  # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\") # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0

                                                                                                                                                                                                                                                                                                                                                                  5 rows \u00d7 22 columns

                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[3]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMClassifier(X, \"RainTomorrow\", verbose=2)\n
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMClassifier(X, \"RainTomorrow\", verbose=2)
                                                                                                                                                                                                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 25.03 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Impute missing values and encode categorical columns\natom.impute()\natom.encode()\n
                                                                                                                                                                                                                                                                                                                                                                  # Impute missing values and encode categorical columns atom.impute() atom.encode()
                                                                                                                                                                                                                                                                                                                                                                  Fitting Imputer...\nImputing missing values...\n --> Dropping 637 samples due to missing values in feature MinTemp.\n --> Dropping 322 samples due to missing values in feature MaxTemp.\n --> Dropping 1406 samples due to missing values in feature Rainfall.\n --> Dropping 60843 samples due to missing values in feature Evaporation.\n --> Dropping 67816 samples due to missing values in feature Sunshine.\n --> Dropping 9330 samples due to missing values in feature WindGustDir.\n --> Dropping 9270 samples due to missing values in feature WindGustSpeed.\n --> Dropping 10013 samples due to missing values in feature WindDir9am.\n --> Dropping 3778 samples due to missing values in feature WindDir3pm.\n --> Dropping 1348 samples due to missing values in feature WindSpeed9am.\n --> Dropping 2630 samples due to missing values in feature WindSpeed3pm.\n --> Dropping 1774 samples due to missing values in feature Humidity9am.\n --> Dropping 3610 samples due to missing values in feature Humidity3pm.\n --> Dropping 14014 samples due to missing values in feature Pressure9am.\n --> Dropping 13981 samples due to missing values in feature Pressure3pm.\n --> Dropping 53657 samples due to missing values in feature Cloud9am.\n --> Dropping 57094 samples due to missing values in feature Cloud3pm.\n --> Dropping 904 samples due to missing values in feature Temp9am.\n --> Dropping 2726 samples due to missing values in feature Temp3pm.\n --> Dropping 1406 samples due to missing values in feature RainToday.\nFitting Encoder...\nEncoding categorical columns...\n --> Target-encoding feature Location. 
Contains 26 classes.\n --> Target-encoding feature WindGustDir. Contains 16 classes.\n --> Target-encoding feature WindDir9am. Contains 16 classes.\n --> Target-encoding feature WindDir3pm. Contains 16 classes.\n --> Ordinal-encoding feature RainToday. Contains 2 classes.\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Train a K-Nearest Neighbors model (using default sklearn)\natom.run(models=\"KNN\", metric=\"f1\")\n
                                                                                                                                                                                                                                                                                                                                                                  # Train a K-Nearest Neighbors model (using default sklearn) atom.run(models=\"KNN\", metric=\"f1\")
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: KNN\nMetric: f1\n\n\nResults for KNearestNeighbors:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.7135\nTest evaluation --> f1: 0.5904\nTime elapsed: 4.239s\n-------------------------------------------------\nTime: 4.239s\n\n\nFinal results ==================== >>\nTotal time: 8.264s\n-------------------------------------\nKNearestNeighbors --> f1: 0.5904\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[7]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Now, we train an accelerated KNN using engine=\"sklearnex\"\n# Note the diffrence in training speed!!\natom.run(models=\"KNN_acc\", metric=\"f1\", engine={\"estimator\": \"sklearnex\"})\n
                                                                                                                                                                                                                                                                                                                                                                  # Now, we train an accelerated KNN using engine=\"sklearnex\" # Note the diffrence in training speed!! atom.run(models=\"KNN_acc\", metric=\"f1\", engine={\"estimator\": \"sklearnex\"})
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: KNN_acc\nMetric: f1\n\n\nResults for KNearestNeighbors:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.7135\nTest evaluation --> f1: 0.5904\nTime elapsed: 1.185s\n-------------------------------------------------\nTime: 1.185s\n\n\nFinal results ==================== >>\nTotal time: 2.226s\n-------------------------------------\nKNearestNeighbors --> f1: 0.5904\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[8]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.results\n
                                                                                                                                                                                                                                                                                                                                                                  atom.results Out[8]: f1_train f1_test time_fit time KNN 0.7135 0.5904 4.238729 4.238729 KNN_acc 0.7135 0.5904 1.184578 1.184578 In\u00a0[9]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Note how the underlying estimators might look the same...\nprint(atom.knn.estimator)\nprint(atom.knn_acc.estimator)\n\n# ... but are using different implementations\nprint(atom.knn.estimator.__module__)\nprint(atom.knn_acc.estimator.__module__)\n
                                                                                                                                                                                                                                                                                                                                                                  # Note how the underlying estimators might look the same... print(atom.knn.estimator) print(atom.knn_acc.estimator) # ... but are using different implementations print(atom.knn.estimator.__module__) print(atom.knn_acc.estimator.__module__)
                                                                                                                                                                                                                                                                                                                                                                  KNeighborsClassifier(n_jobs=1)\nKNeighborsClassifier(n_jobs=1)\nsklearn.neighbors._classification\nsklearnex.neighbors.knn_classification\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[10]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  with atom.canvas(1, 2, title=\"Timing engines: sklearn vs sklearnex\"):\n    atom.plot_results(metric=\"time_fit\", title=\"Training\")\n    atom.plot_results(metric=\"time\", title=\"Total\")\n
                                                                                                                                                                                                                                                                                                                                                                  with atom.canvas(1, 2, title=\"Timing engines: sklearn vs sklearnex\"): atom.plot_results(metric=\"time_fit\", title=\"Training\") atom.plot_results(metric=\"time\", title=\"Total\")"}, {"location": "examples/accelerating_sklearnex/#example-accelerating-pipelines", "title": "Example: Accelerating pipelines\u00b6", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This example shows how to accelerate your models on cpu using sklearnex.

                                                                                                                                                                                                                                                                                                                                                                  The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target RainTomorrow.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/accelerating_sklearnex/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/accelerating_sklearnex/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/accelerating_sklearnex/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/advanced_plotting/", "title": "Advanced plotting", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                                                                                                                                                                                                  # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n
                                                                                                                                                                                                                                                                                                                                                                  # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\") # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0

                                                                                                                                                                                                                                                                                                                                                                  5 rows \u00d7 22 columns

                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[3]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMClassifier(X, y=\"RainTomorrow\", verbose=1)\natom.impute()\natom.encode()\n
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMClassifier(X, y=\"RainTomorrow\", verbose=1) atom.impute() atom.encode()
                                                                                                                                                                                                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 25.03 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n\nFitting Imputer...\nImputing missing values...\nFitting Encoder...\nEncoding categorical columns...\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Let's see how the default aesthetics looks like\natom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\")\n
                                                                                                                                                                                                                                                                                                                                                                  # Let's see how the default aesthetics looks like atom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\") In\u00a0[5]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Change the color palette using color names or their hex codes\natom.palette = [\"red\", \"#00f\"]\n
                                                                                                                                                                                                                                                                                                                                                                  # Change the color palette using color names or their hex codes atom.palette = [\"red\", \"#00f\"] In\u00a0[6]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\")\n
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\") In\u00a0[7]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Change the title and label fontsize\natom.title_fontsize = 30\natom.label_fontsize = 24\natom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\")\n
                                                                                                                                                                                                                                                                                                                                                                  # Change the title and label fontsize atom.title_fontsize = 30 atom.label_fontsize = 24 atom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\") In\u00a0[8]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Use the update_layout method to change layout properties\natom.update_layout(template=\"simple_white\", barmode=\"group\", hovermode=\"x\")\natom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\")\n
                                                                                                                                                                                                                                                                                                                                                                  # Use the update_layout method to change layout properties atom.update_layout(template=\"simple_white\", barmode=\"group\", hovermode=\"x\") atom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\") In\u00a0[9]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Use the update_traces method to change the trace (note the y-axis)\natom.update_traces(histnorm=\"percent\", selector=dict(type=\"histogram\"))\natom.plot_distribution(columns=[1, 2], distributions=None, title=\"Distribution of temperatures\")\n
                                                                                                                                                                                                                                                                                                                                                                  # Use the update_traces method to change the trace (note the y-axis) atom.update_traces(histnorm=\"percent\", selector=dict(type=\"histogram\")) atom.plot_distribution(columns=[1, 2], distributions=None, title=\"Distribution of temperatures\") In\u00a0[10]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Let's go back to the default aesthetics\natom.reset_aesthetics()\natom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\")\n
                                                                                                                                                                                                                                                                                                                                                                  # Let's go back to the default aesthetics atom.reset_aesthetics() atom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\") In\u00a0[11]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # And update the title with some custom fonts\natom.plot_distribution(\n    columns=[1, 2],\n    title=dict(\n        text=\"Distribution of temperatures\",\n        font_color=\"teal\",\n        x=0,\n        xanchor=\"left\",\n    )\n)\n
                                                                                                                                                                                                                                                                                                                                                                  # And update the title with some custom fonts atom.plot_distribution( columns=[1, 2], title=dict( text=\"Distribution of temperatures\", font_color=\"teal\", x=0, xanchor=\"left\", ) ) In\u00a0[12]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # We can update the legend in a similar fashion\natom.plot_distribution(\n    columns=[1, 2],\n    title=dict(\n        text=\"Distribution of temperatures\",\n        font_color=\"teal\",\n        x=0,\n        xanchor=\"left\",\n    ),\n    legend=dict(title=\"Legend's title\"),\n)\n
                                                                                                                                                                                                                                                                                                                                                                  # We can update the legend in a similar fashion atom.plot_distribution( columns=[1, 2], title=dict( text=\"Distribution of temperatures\", font_color=\"teal\", x=0, xanchor=\"left\", ), legend=dict(title=\"Legend's title\"), ) In\u00a0[13]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.run(\"LR\")\n\n# You can plot the ROC curve for a selection of rows,\n# for example, for rows in a specific location\natom.plot_roc(\n    rows={\n        \"Portland\": atom.test.loc[atom.og.X.Location == \"Portland\"],\n        \"Sydney\": atom.test.loc[atom.og.X.Location == \"Sydney\"],\n    }\n)\n
                                                                                                                                                                                                                                                                                                                                                                  atom.run(\"LR\") # You can plot the ROC curve for a selection of rows, # for example, for rows in a specific location atom.plot_roc( rows={ \"Portland\": atom.test.loc[atom.og.X.Location == \"Portland\"], \"Sydney\": atom.test.loc[atom.og.X.Location == \"Sydney\"], } )
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: LR\nMetric: f1\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.6247\nTest evaluation --> f1: 0.6093\nTime elapsed: 0.636s\n-------------------------------------------------\nTime: 0.636s\n\n\nFinal results ==================== >>\nTotal time: 1.044s\n-------------------------------------\nLogisticRegression --> f1: 0.6093\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[14]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Note how the same column over different plots is grouped\nwith atom.canvas(2, 2):\n    atom.plot_distribution(columns=1)\n    atom.plot_distribution(columns=2)\n    atom.plot_qq(columns=[1, 2], distributions=[\"norm\", \"invgauss\"])\n    atom.plot_qq(columns=[1, 2])\n
                                                                                                                                                                                                                                                                                                                                                                  # Note how the same column over different plots is grouped with atom.canvas(2, 2): atom.plot_distribution(columns=1) atom.plot_distribution(columns=2) atom.plot_qq(columns=[1, 2], distributions=[\"norm\", \"invgauss\"]) atom.plot_qq(columns=[1, 2])"}, {"location": "examples/advanced_plotting/#example-advanced-plotting", "title": "Example: Advanced plotting\u00b6", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This example shows how to make the best use of all of atom's plotting options.

                                                                                                                                                                                                                                                                                                                                                                  The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target RainTomorrow.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/advanced_plotting/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#customize-colors-and-font-size", "title": "Customize colors and font size\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#customize-the-plots-layout", "title": "Customize the plot's layout\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#customize-the-plots-traces", "title": "Customize the plot's traces\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#customize-the-title-and-legend", "title": "Customize the title and legend\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#customizing-the-rows-to-plot", "title": "Customizing the rows to plot\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#using-a-canvas", "title": "Using a canvas\u00b6", "text": ""}, {"location": "examples/automated_feature_scaling/", "title": "Automated feature scaling", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Import packages\nfrom sklearn.datasets import load_breast_cancer\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                                                                                                                                                                                                  # Import packages from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Load the data\nX, y = load_breast_cancer(return_X_y=True)\n
                                                                                                                                                                                                                                                                                                                                                                  # Load the data X, y = load_breast_cancer(return_X_y=True) In\u00a0[3]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMClassifier(X, y, verbose=2, random_state=1)\n
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMClassifier(X, y, verbose=2, random_state=1)
                                                                                                                                                                                                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 141.24 kB\nScaled: False\nOutlier values: 167 (1.2%)\n\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Check which models require feature scaling\natom.available_models()[[\"acronym\", \"model\", \"needs_scaling\"]]\n
                                                                                                                                                                                                                                                                                                                                                                  # Check which models require feature scaling atom.available_models()[[\"acronym\", \"model\", \"needs_scaling\"]] Out[4]: acronym model needs_scaling 0 AdaB AdaBoost False 1 Bag Bagging False 2 BNB BernoulliNB False 3 CatB CatBoost True 4 CatNB CategoricalNB False 5 CNB ComplementNB False 6 Tree DecisionTree False 7 Dummy Dummy False 8 ETree ExtraTree False 9 ET ExtraTrees False 10 GNB GaussianNB False 11 GP GaussianProcess False 12 GBM GradientBoostingMachine False 13 hGBM HistGradientBoosting False 14 KNN KNearestNeighbors True 15 LGB LightGBM True 16 LDA LinearDiscriminantAnalysis False 17 lSVM LinearSVM True 18 LR LogisticRegression True 19 MLP MultiLayerPerceptron True 20 MNB MultinomialNB False 21 PA PassiveAggressive True 22 Perc Perceptron True 23 QDA QuadraticDiscriminantAnalysis False 24 RNN RadiusNearestNeighbors True 25 RF RandomForest False 26 Ridge Ridge True 27 SGD StochasticGradientDescent True 28 SVM SupportVectorMachine True 29 XGB XGBoost True In\u00a0[5]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # We fit two models: LR needs scaling and Bag doesn't\natom.run([\"LR\", \"Bag\"])\n
                                                                                                                                                                                                                                                                                                                                                                  # We fit two models: LR needs scaling and Bag doesn't atom.run([\"LR\", \"Bag\"])
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: LR, Bag\nMetric: f1\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9913\nTest evaluation --> f1: 0.9861\nTime elapsed: 0.051s\n-------------------------------------------------\nTime: 0.051s\n\n\nResults for Bagging:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9982\nTest evaluation --> f1: 0.9444\nTime elapsed: 0.111s\n-------------------------------------------------\nTime: 0.111s\n\n\nFinal results ==================== >>\nTotal time: 0.216s\n-------------------------------------\nLogisticRegression --> f1: 0.9861 !\nBagging            --> f1: 0.9444\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Now, we create a new branch and scale the features before fitting the model\natom.branch = \"scaling\"\n
                                                                                                                                                                                                                                                                                                                                                                  # Now, we create a new branch and scale the features before fitting the model atom.branch = \"scaling\"
                                                                                                                                                                                                                                                                                                                                                                  Successfully created new branch: scaling.\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[7]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.scale()\n
                                                                                                                                                                                                                                                                                                                                                                  atom.scale()
                                                                                                                                                                                                                                                                                                                                                                  Fitting Scaler...\nScaling features...\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[8]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.run(\"LR_2\")\n
                                                                                                                                                                                                                                                                                                                                                                  atom.run(\"LR_2\")
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: LR_2\nMetric: f1\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9913\nTest evaluation --> f1: 0.9861\nTime elapsed: 0.035s\n-------------------------------------------------\nTime: 0.035s\n\n\nFinal results ==================== >>\nTotal time: 0.057s\n-------------------------------------\nLogisticRegression --> f1: 0.9861\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[9]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Let's compare the differences between the models\nprint(atom.lr.scaler)\nprint(atom.bag.scaler)\nprint(atom.lr_2.scaler)\n
                                                                                                                                                                                                                                                                                                                                                                  # Let's compare the differences between the models print(atom.lr.scaler) print(atom.bag.scaler) print(atom.lr_2.scaler)
                                                                                                                                                                                                                                                                                                                                                                  Scaler()\nNone\nNone\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[10]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # And the data they use is different\nprint(atom.lr.X.iloc[:5, :3])\nprint(\"-----------------------------\")\nprint(atom.bag.X.iloc[:5, :3])\nprint(\"-----------------------------\")\nprint(atom.lr_2.X_train.equals(atom.lr.X_train))\n
                                                                                                                                                                                                                                                                                                                                                                  # And the data they use is different print(atom.lr.X.iloc[:5, :3]) print(\"-----------------------------\") print(atom.bag.X.iloc[:5, :3]) print(\"-----------------------------\") print(atom.lr_2.X_train.equals(atom.lr.X_train))
                                                                                                                                                                                                                                                                                                                                                                           x0        x1        x2\n0 -0.181875  0.356669 -0.147122\n1  1.162216  0.300578  1.159704\n2  1.056470  1.212060  0.933833\n3  0.277287  2.457753  0.188054\n4 -1.442482 -0.825921 -1.343434\n-----------------------------\n      x0     x1      x2\n0  13.48  20.82   88.40\n1  18.31  20.58  120.80\n2  17.93  24.48  115.20\n3  15.13  29.81   96.71\n4   8.95  15.76   58.74\n-----------------------------\nTrue\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[11]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Note that the scaler is included in the model's pipeline\nprint(atom.lr.pipeline)\nprint(\"-----------------------------\")\nprint(atom.bag.pipeline)\nprint(\"-----------------------------\")\nprint(atom.lr_2.pipeline)\n
                                                                                                                                                                                                                                                                                                                                                                  # Note that the scaler is included in the model's pipeline print(atom.lr.pipeline) print(\"-----------------------------\") print(atom.bag.pipeline) print(\"-----------------------------\") print(atom.lr_2.pipeline)
                                                                                                                                                                                                                                                                                                                                                                  Pipeline(memory=Memory(location=None), steps=[('AutomatedScaler', Scaler())])\n-----------------------------\nPipeline(memory=Memory(location=None), steps=[])\n-----------------------------\nPipeline(memory=Memory(location=None), steps=[('Scaler', Scaler(verbose=2))])\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[12]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_pipeline()\n
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_pipeline()"}, {"location": "examples/automated_feature_scaling/#example-automated-feature-scaling", "title": "Example: Automated feature scaling\u00b6", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This example shows how ATOM handles models that require automated feature scaling.

                                                                                                                                                                                                                                                                                                                                                                  Import the breast cancer dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/automated_feature_scaling/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/automated_feature_scaling/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/automated_feature_scaling/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/binary_classification/", "title": "Binary classification", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                                                                                                                                                                                                  # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n
                                                                                                                                                                                                                                                                                                                                                                  # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\") # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0

                                                                                                                                                                                                                                                                                                                                                                  5 rows \u00d7 22 columns

                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[3]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Call atom using only 5% of the complete dataset (for explanatory purposes)\natom = ATOMClassifier(X, y=\"RainTomorrow\", n_rows=0.05, n_jobs=8, verbose=2)\n
                                                                                                                                                                                                                                                                                                                                                                  # Call atom using only 5% of the complete dataset (for explanatory purposes) atom = ATOMClassifier(X, y=\"RainTomorrow\", n_rows=0.05, n_jobs=8, verbose=2)
                                                                                                                                                                                                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\nParallel processing with 8 cores.\nParallelization backend: loky\n\nDataset stats ==================== >>\nShape: (7109, 22)\nTrain set size: 5688\nTest set size: 1421\n-------------------------------------\nMemory: 1.25 MB\nScaled: False\nMissing values: 15868 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 1 (0.0%)\n\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Impute missing values\natom.impute(strat_num=\"median\", strat_cat=\"drop\", max_nan_rows=0.8)\n
                                                                                                                                                                                                                                                                                                                                                                  # Impute missing values atom.impute(strat_num=\"median\", strat_cat=\"drop\", max_nan_rows=0.8)
                                                                                                                                                                                                                                                                                                                                                                  Fitting Imputer...\nImputing missing values...\n --> Dropping 7 samples for containing more than 16 missing values.\n --> Imputing 23 missing values with median (11.9) in feature MinTemp.\n --> Imputing 10 missing values with median (22.6) in feature MaxTemp.\n --> Imputing 72 missing values with median (0.0) in feature Rainfall.\n --> Imputing 3059 missing values with median (4.6) in feature Evaporation.\n --> Imputing 3382 missing values with median (8.5) in feature Sunshine.\n --> Dropping 467 samples due to missing values in feature WindGustDir.\n --> Imputing 466 missing values with median (39.0) in feature WindGustSpeed.\n --> Dropping 479 samples due to missing values in feature WindDir9am.\n --> Dropping 165 samples due to missing values in feature WindDir3pm.\n --> Imputing 53 missing values with median (13.0) in feature WindSpeed9am.\n --> Imputing 115 missing values with median (17.0) in feature WindSpeed3pm.\n --> Imputing 72 missing values with median (70.0) in feature Humidity9am.\n --> Imputing 164 missing values with median (52.0) in feature Humidity3pm.\n --> Imputing 699 missing values with median (1017.7) in feature Pressure9am.\n --> Imputing 699 missing values with median (1015.4) in feature Pressure3pm.\n --> Imputing 2698 missing values with median (5.0) in feature Cloud9am.\n --> Imputing 2903 missing values with median (5.0) in feature Cloud3pm.\n --> Imputing 32 missing values with median (16.7) in feature Temp9am.\n --> Imputing 116 missing values with median (21.1) in feature Temp3pm.\n --> Dropping 72 samples due to missing values in feature RainToday.\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Encode the categorical features\natom.encode(strategy=\"Target\", max_onehot=10, infrequent_to_value=0.04)\n
                                                                                                                                                                                                                                                                                                                                                                  # Encode the categorical features atom.encode(strategy=\"Target\", max_onehot=10, infrequent_to_value=0.04)
                                                                                                                                                                                                                                                                                                                                                                  Fitting Encoder...\nEncoding categorical columns...\n --> Target-encoding feature Location. Contains 47 classes.\n --> Target-encoding feature WindGustDir. Contains 16 classes.\n --> Target-encoding feature WindDir9am. Contains 16 classes.\n --> Target-encoding feature WindDir3pm. Contains 16 classes.\n --> Ordinal-encoding feature RainToday. Contains 2 classes.\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Train an Extra-Trees and a Random Forest model\natom.run(models=[\"ET\", \"RF\"], metric=\"f1\", n_bootstrap=5)\n
                                                                                                                                                                                                                                                                                                                                                                  # Train an Extra-Trees and a Random Forest model atom.run(models=[\"ET\", \"RF\"], metric=\"f1\", n_bootstrap=5)
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: ET, RF\nMetric: f1\n\n\nResults for ExtraTrees:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.5956\nTime elapsed: 1.414s\nBootstrap ---------------------------------------\nEvaluation --> f1: 0.5709 \u00b1 0.0198\nTime elapsed: 1.020s\n-------------------------------------------------\nTime: 2.434s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.6124\nTime elapsed: 0.337s\nBootstrap ---------------------------------------\nEvaluation --> f1: 0.5802 \u00b1 0.0111\nTime elapsed: 1.281s\n-------------------------------------------------\nTime: 1.618s\n\n\nFinal results ==================== >>\nTotal time: 4.225s\n-------------------------------------\nExtraTrees   --> f1: 0.5709 \u00b1 0.0198 ~\nRandomForest --> f1: 0.5802 \u00b1 0.0111 ~ !\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[7]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Let's have a look at the final results\natom.results\n
                                                                                                                                                                                                                                                                                                                                                                  # Let's have a look at the final results atom.results Out[7]: f1_train f1_test time_fit f1_bootstrap time_bootstrap time ET 0.8503 0.5688 1.414043 0.570892 1.019728 2.433771 RF 0.8552 0.5612 0.336765 0.580178 1.281000 1.617765 In\u00a0[8]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Visualize the bootstrap results\natom.plot_results(title=\"RF vs ET performance\")\n
                                                                                                                                                                                                                                                                                                                                                                  # Visualize the bootstrap results atom.plot_results(title=\"RF vs ET performance\") In\u00a0[9]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Print the results of some common metrics\natom.evaluate()\n
                                                                                                                                                                                                                                                                                                                                                                  # Print the results of some common metrics atom.evaluate() Out[9]: accuracy ap ba f1 jaccard mcc precision recall auc ET 0.8478 0.6904 0.7059 0.5688 0.3974 0.5108 0.7750 0.4493 0.8561 RF 0.8405 0.6775 0.7038 0.5612 0.3901 0.4891 0.7283 0.4565 0.8502 In\u00a0[10]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # The winner attribute calls the best model (atom.winner == atom.rf)\nprint(f\"The winner is the {atom.winner.name} model!!\")\n
                                                                                                                                                                                                                                                                                                                                                                  # The winner attribute calls the best model (atom.winner == atom.rf) print(f\"The winner is the {atom.winner.name} model!!\")
                                                                                                                                                                                                                                                                                                                                                                  The winner is the RF model!!\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[11]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Visualize the distribution of predicted probabilities\natom.winner.plot_probabilities()\n
                                                                                                                                                                                                                                                                                                                                                                  # Visualize the distribution of predicted probabilities atom.winner.plot_probabilities() In\u00a0[12]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Compare how different metrics perform for different thresholds\natom.winner.plot_threshold(metric=[\"f1\", \"accuracy\", \"ap\"], steps=50)\n
                                                                                                                                                                                                                                                                                                                                                                  # Compare how different metrics perform for different thresholds atom.winner.plot_threshold(metric=[\"f1\", \"accuracy\", \"ap\"], steps=50)"}, {"location": "examples/binary_classification/#example-binary-classification", "title": "Example: Binary classification\u00b6", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This example shows how to use ATOM to solve a binary classification problem. Additonnaly, we'll perform a variety of data cleaning steps to prepare the data for modeling.

                                                                                                                                                                                                                                                                                                                                                                  The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target RainTomorrow.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/binary_classification/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/binary_classification/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/binary_classification/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/calibration/", "title": "Calibration", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                                                                                                                                                                                                  # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Load the data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n
                                                                                                                                                                                                                                                                                                                                                                  # Load the data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\") # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0

                                                                                                                                                                                                                                                                                                                                                                  5 rows \u00d7 22 columns

                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[3]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMClassifier(X, \"RainTomorrow\", n_rows=1e4, verbose=1, warnings=False)\n\n# Apply data cleaning steps\natom.clean()\natom.impute(strat_num=\"median\", strat_cat=\"most_frequent\")\natom.encode(strategy=\"target\", max_onehot=5, infrequent_to_value=0.05)\n\n# Train a linear SVM\natom.run(\"gnb\")\n
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMClassifier(X, \"RainTomorrow\", n_rows=1e4, verbose=1, warnings=False) # Apply data cleaning steps atom.clean() atom.impute(strat_num=\"median\", strat_cat=\"most_frequent\") atom.encode(strategy=\"target\", max_onehot=5, infrequent_to_value=0.05) # Train a linear SVM atom.run(\"gnb\")
                                                                                                                                                                                                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (10000, 22)\nTrain set size: 8000\nTest set size: 2000\n-------------------------------------\nMemory: 1.76 MB\nScaled: False\nMissing values: 22184 (10.1%)\nCategorical features: 5 (23.8%)\n\nFitting Cleaner...\nCleaning the data...\nFitting Imputer...\nImputing missing values...\nFitting Encoder...\nEncoding categorical columns...\n\nTraining ========================= >>\nModels: GNB\nMetric: f1\n\n\nResults for GaussianNB:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.5807\nTest evaluation --> f1: 0.5971\nTime elapsed: 0.094s\n-------------------------------------------------\nTime: 0.094s\n\n\nFinal results ==================== >>\nTotal time: 0.160s\n-------------------------------------\nGaussianNB --> f1: 0.5971\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Check the model's calibration\natom.plot_calibration()\n
                                                                                                                                                                                                                                                                                                                                                                  # Check the model's calibration atom.plot_calibration() In\u00a0[5]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Let's try to improve it using the calibrate method\natom.winner.calibrate(method=\"isotonic\", cv=5)\n
                                                                                                                                                                                                                                                                                                                                                                  # Let's try to improve it using the calibrate method atom.winner.calibrate(method=\"isotonic\", cv=5)
                                                                                                                                                                                                                                                                                                                                                                  Results for GaussianNB:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.5034\nTest evaluation --> f1: 0.5061\nTime elapsed: 0.282s\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # And check again...\natom.plot_calibration()\n
                                                                                                                                                                                                                                                                                                                                                                  # And check again... atom.plot_calibration()"}, {"location": "examples/calibration/#example-calibration", "title": "Example: Calibration\u00b6", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This example shows how to calibrate a classifier through atom.

                                                                                                                                                                                                                                                                                                                                                                  The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target RainTomorrow.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/calibration/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/calibration/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/calibration/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/deep_learning/", "title": "Deep learning", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Disable annoying tf warnings\nimport os\nos.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"3\"\n\nfrom tensorflow import get_logger\nget_logger().setLevel('ERROR')\n\nimport absl.logging\nabsl.logging.set_verbosity(absl.logging.ERROR)\n\nfrom atom import ATOMClassifier, ATOMModel\nfrom sklearn.preprocessing import FunctionTransformer\nfrom optuna.pruners import PatientPruner\nfrom optuna.distributions import CategoricalDistribution, IntDistribution\n\nfrom scikeras.wrappers import KerasClassifier\nfrom keras.datasets import mnist\nfrom keras.models import Sequential\nfrom keras.layers import Dense, Flatten, Conv2D, Dropout\n
                                                                                                                                                                                                                                                                                                                                                                  # Disable annoying tf warnings import os os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"3\" from tensorflow import get_logger get_logger().setLevel('ERROR') import absl.logging absl.logging.set_verbosity(absl.logging.ERROR) from atom import ATOMClassifier, ATOMModel from sklearn.preprocessing import FunctionTransformer from optuna.pruners import PatientPruner from optuna.distributions import CategoricalDistribution, IntDistribution from scikeras.wrappers import KerasClassifier from keras.datasets import mnist from keras.models import Sequential from keras.layers import Dense, Flatten, Conv2D, Dropout In\u00a0[2]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Download the MNIST dataset\n(X_train, y_train), (X_test, y_test) = mnist.load_data()\n\n# Flatten data to follow sklearn's API (2d input)\nX_train = X_train.reshape(len(X_train), -1)\nX_test = X_test.reshape(len(X_test), -1)\n\ndata = (X_train, y_train), (X_test, y_test)\n
                                                                                                                                                                                                                                                                                                                                                                  # Download the MNIST dataset (X_train, y_train), (X_test, y_test) = mnist.load_data() # Flatten data to follow sklearn's API (2d input) X_train = X_train.reshape(len(X_train), -1) X_test = X_test.reshape(len(X_test), -1) data = (X_train, y_train), (X_test, y_test) In\u00a0[3]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Create the convolutional neural network\nclass ConvNN(KerasClassifier):\n    \"\"\"Convolutional neural network model.\"\"\"\n\n    @property\n    def feature_encoder(self):\n        \"\"\"Convert the 2d input to the image's format (len(X), 28, 28, 1).\"\"\"\n        return FunctionTransformer(\n            func=lambda X: X.reshape(X.shape[0], 28, 28, 1),\n        )\n\n    @staticmethod\n    def _keras_build_fn(**kwargs):\n        \"\"\"Create the model's architecture.\"\"\"\n        model = Sequential()\n        model.add(\n            Conv2D(\n                filters=8,\n                kernel_size=3,\n                activation=\"relu\",\n                input_shape=(28, 28, 1),\n            )\n        )\n        model.add(Conv2D(filters=4, kernel_size=5, activation=\"relu\"))\n        model.add(Flatten())\n        model.add(Dense(units=10, activation=\"softmax\"))\n        model.compile(\n            optimizer=\"adam\",\n            loss=\"sparse_categorical_crossentropy\",\n        )\n\n        return model\n
                                                                                                                                                                                                                                                                                                                                                                  # Create the convolutional neural network class ConvNN(KerasClassifier): \"\"\"Convolutional neural network model.\"\"\" @property def feature_encoder(self): \"\"\"Convert the 2d input to the image's format (len(X), 28, 28, 1).\"\"\" return FunctionTransformer( func=lambda X: X.reshape(X.shape[0], 28, 28, 1), ) @staticmethod def _keras_build_fn(**kwargs): \"\"\"Create the model's architecture.\"\"\" model = Sequential() model.add( Conv2D( filters=8, kernel_size=3, activation=\"relu\", input_shape=(28, 28, 1), ) ) model.add(Conv2D(filters=4, kernel_size=5, activation=\"relu\")) model.add(Flatten()) model.add(Dense(units=10, activation=\"softmax\")) model.compile( optimizer=\"adam\", loss=\"sparse_categorical_crossentropy\", ) return model In\u00a0[4]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Convert the model to an ATOM model\nmodel = ATOMModel(\n    estimator=ConvNN(verbose=0),\n    acronym=\"CNN\",\n    needs_scaling=True,  # Applies automated feature scaling before fitting\n    has_validation=\"epochs\",  # Applies in-training validation on parameter epochs\n)\n
                                                                                                                                                                                                                                                                                                                                                                  # Convert the model to an ATOM model model = ATOMModel( estimator=ConvNN(verbose=0), acronym=\"CNN\", needs_scaling=True, # Applies automated feature scaling before fitting has_validation=\"epochs\", # Applies in-training validation on parameter epochs ) In\u00a0[5]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMClassifier(*data, n_rows=0.1, verbose=2, random_state=1)\n
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMClassifier(*data, n_rows=0.1, verbose=2, random_state=1)
                                                                                                                                                                                                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Multiclass classification.\n\nDataset stats ==================== >>\nShape: (7000, 785)\nTrain set size: 6000\nTest set size: 1000\n-------------------------------------\nMemory: 5.54 MB\nScaled: False\nOutlier values: 41839 (0.9%)\n\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Like any other model, we can define custom distributions for hyperparameter tuning\natom.run(\n    models=model,\n    metric=\"f1_weighted\",\n    n_trials=12,\n    ht_params={\n        \"distributions\": {\n            \"epochs\": IntDistribution(2, 10),\n            \"batch_size\": CategoricalDistribution([128, 256, 512]),\n        },\n    }\n)\n
                                                                                                                                                                                                                                                                                                                                                                  # Like any other model, we can define custom distributions for hyperparameter tuning atom.run( models=model, metric=\"f1_weighted\", n_trials=12, ht_params={ \"distributions\": { \"epochs\": IntDistribution(2, 10), \"batch_size\": CategoricalDistribution([128, 256, 512]), }, } )
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: CNN\nMetric: f1_weighted\n\n\nRunning hyperparameter tuning for ConvNN...\n| trial |  epochs | batch_size | f1_weighted | best_f1_weighted | time_trial | time_ht |    state |\n| ----- | ------- | ---------- | ----------- | ---------------- | ---------- | ------- | -------- |\n| 0     |       5 |        128 |      0.9147 |           0.9147 |     9.127s |  9.127s | COMPLETE |\n| 1     |       3 |        512 |      0.8539 |           0.9147 |     4.995s | 14.122s | COMPLETE |\n| 2     |       5 |        512 |      0.8931 |           0.9147 |     7.712s | 21.834s | COMPLETE |\n| 3     |       3 |        128 |       0.901 |           0.9147 |     5.706s | 27.540s | COMPLETE |\n| 4     |       5 |        128 |      0.9147 |           0.9147 |     0.607s | 28.147s | COMPLETE |\n| 5     |       9 |        128 |      0.9251 |           0.9251 |    15.297s | 43.443s | COMPLETE |\n| 6     |       9 |        128 |      0.9251 |           0.9251 |     1.230s | 44.673s | COMPLETE |\n| 7     |       3 |        128 |       0.901 |           0.9251 |     0.636s | 45.309s | COMPLETE |\n| 8     |      10 |        256 |      0.8131 |           0.9251 |     2.573s | 47.882s |   PRUNED |\n| 9     |       8 |        128 |      0.9191 |           0.9251 |    14.014s | 01m:02s |   PRUNED |\n| 10    |       7 |        256 |       0.836 |           0.9251 |     2.498s | 01m:04s |   PRUNED |\n| 11    |      10 |        128 |      0.9431 |           0.9431 |    16.725s | 01m:21s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 11\nBest parameters:\n --> epochs: 10\n 
--> batch_size: 128\nBest evaluation --> f1_weighted: 0.9431\nTime elapsed: 01m:21s\nFit ---------------------------------------------\nTrain evaluation --> f1_weighted: 0.9835\nTest evaluation --> f1_weighted: 0.952\nTime elapsed: 28.600s\n-------------------------------------------------\nTime: 01m:50s\n\n\nFinal results ==================== >>\nTotal time: 03m:39s\n-------------------------------------\nConvNN --> f1_weighted: 0.952\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[7]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.cnn.trials\n
                                                                                                                                                                                                                                                                                                                                                                  atom.cnn.trials Out[7]: epochs batch_size estimator f1_weighted best_f1_weighted time_trial time_ht state trial 0 5 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.914748 0.943121 9.126504 9.126504 COMPLETE 1 3 512 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.853919 0.943121 4.995052 14.121556 COMPLETE 2 5 512 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.893128 0.943121 7.712461 21.834017 COMPLETE 3 3 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.900996 0.943121 5.705581 27.539598 COMPLETE 4 5 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.914748 0.943121 0.607057 28.146655 COMPLETE 5 9 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.925088 0.943121 15.296670 43.443325 COMPLETE 6 9 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.925088 0.943121 1.229779 44.673104 COMPLETE 7 3 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.900996 0.943121 0.635578 45.308682 COMPLETE 8 10 256 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.813073 0.943121 2.573343 47.882025 PRUNED 9 8 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.919095 0.943121 14.014060 61.896085 PRUNED 10 7 256 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.835966 0.943121 2.498169 64.394254 PRUNED 11 10 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.943121 0.943121 16.725048 81.119302 COMPLETE In\u00a0[8]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_evals(dataset=\"test+train\")\n
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_evals(dataset=\"test+train\") In\u00a0[9]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Use the prediction methods like any other model\natom.cnn.predict_proba(X_train)\n
                                                                                                                                                                                                                                                                                                                                                                  # Use the prediction methods like any other model atom.cnn.predict_proba(X_train) Out[9]: 0 1 2 3 4 5 6 7 8 9 0 6.981344e-08 1.163047e-08 1.302092e-07 7.298404e-01 4.980663e-11 2.701415e-01 6.764501e-11 1.982446e-06 5.807213e-07 1.532895e-05 1 9.999958e-01 2.160013e-12 2.527803e-06 1.498349e-07 2.094386e-09 4.418725e-07 6.460270e-07 2.255171e-07 2.042284e-08 7.188346e-08 2 1.154879e-10 2.405690e-10 1.185454e-07 3.165163e-07 9.995613e-01 1.887145e-11 6.159564e-12 4.155245e-04 1.546579e-09 2.274483e-05 3 5.565947e-07 9.992028e-01 6.758810e-04 3.334095e-06 2.312364e-05 9.298934e-08 1.309337e-07 7.859311e-05 1.515798e-05 3.681653e-07 4 4.683458e-09 4.092270e-08 3.246872e-07 1.020155e-06 2.804452e-03 9.423515e-08 3.789635e-12 8.406813e-03 7.883451e-05 9.887084e-01 ... ... ... ... ... ... ... ... ... ... ... 59995 7.329114e-09 4.127999e-08 3.695257e-06 1.461548e-04 1.231008e-09 6.157245e-06 2.624072e-11 8.209722e-09 9.998319e-01 1.199038e-05 59996 6.239399e-08 2.397851e-09 1.575265e-03 9.643788e-01 8.514269e-08 1.101398e-04 1.774388e-10 1.135693e-07 3.362476e-02 3.106496e-04 59997 7.059591e-10 5.808693e-09 1.657147e-11 3.829917e-05 3.490374e-07 9.998387e-01 4.054391e-11 4.646493e-11 1.087904e-04 1.385001e-05 59998 1.183419e-05 2.104532e-09 1.940764e-06 1.050059e-07 8.195059e-06 5.124656e-06 9.999721e-01 4.185512e-09 7.723169e-07 1.096977e-09 59999 3.987676e-04 1.140556e-06 4.448286e-04 4.279935e-06 1.410985e-07 2.539659e-03 8.256741e-08 8.921248e-08 9.958331e-01 7.779775e-04

                                                                                                                                                                                                                                                                                                                                                                  60000 rows \u00d7 10 columns

                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[10]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Or make plots...\natom.cnn.plot_hyperparameters()\n
                                                                                                                                                                                                                                                                                                                                                                  # Or make plots... atom.cnn.plot_hyperparameters() In\u00a0[11]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_parallel_coordinate()\n
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_parallel_coordinate()"}, {"location": "examples/deep_learning/#example-deep-learning", "title": "Example: Deep learning\u00b6", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This example shows how to use ATOM to train and validate a Convolutional Neural Network implemented with Keras using scikeras.

                                                                                                                                                                                                                                                                                                                                                                  Import the MNIST dataset from keras.datasets. This is a well known image dataset whose goal is to classify handwritten digits.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/deep_learning/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/deep_learning/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/deep_learning/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/ensembles/", "title": "Ensembles", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Import packages\nfrom sklearn.datasets import load_breast_cancer\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                                                                                                                                                                                                  # Import packages from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Load the data\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n
                                                                                                                                                                                                                                                                                                                                                                  # Load the data X, y = load_breast_cancer(return_X_y=True, as_frame=True) In\u00a0[3]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Initialize atom and train several models\natom = ATOMClassifier(X, y, verbose=2, random_state=1)\natom.run(models=[\"LR\", \"Tree\", \"LGB\"], metric=\"accuracy\")\n
                                                                                                                                                                                                                                                                                                                                                                  # Initialize atom and train several models atom = ATOMClassifier(X, y, verbose=2, random_state=1) atom.run(models=[\"LR\", \"Tree\", \"LGB\"], metric=\"accuracy\")
                                                                                                                                                                                                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 138.97 kB\nScaled: False\nOutlier values: 167 (1.2%)\n\n\nTraining ========================= >>\nModels: LR, Tree, LGB\nMetric: accuracy\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> accuracy: 0.989\nTest evaluation --> accuracy: 0.9823\nTime elapsed: 0.048s\n-------------------------------------------------\nTime: 0.048s\n\n\nResults for DecisionTree:\nFit ---------------------------------------------\nTrain evaluation --> accuracy: 1.0\nTest evaluation --> accuracy: 0.9469\nTime elapsed: 0.042s\n-------------------------------------------------\nTime: 0.042s\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --> accuracy: 1.0\nTest evaluation --> accuracy: 0.9469\nTime elapsed: 0.246s\n-------------------------------------------------\nTime: 0.246s\n\n\nFinal results ==================== >>\nTotal time: 0.419s\n-------------------------------------\nLogisticRegression --> accuracy: 0.9823 !\nDecisionTree       --> accuracy: 0.9469\nLightGBM           --> accuracy: 0.9469\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Combine the models into a Voting model\natom.voting(voting=\"soft\")\n
                                                                                                                                                                                                                                                                                                                                                                  # Combine the models into a Voting model atom.voting(voting=\"soft\")
                                                                                                                                                                                                                                                                                                                                                                  Results for Voting:\nFit ---------------------------------------------\nTrain evaluation --> accuracy: 1.0\nTest evaluation --> accuracy: 0.9469\nTime elapsed: 0.055s\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Note that we now have an extra model in the pipeline\natom.models\n
                                                                                                                                                                                                                                                                                                                                                                  # Note that we now have an extra model in the pipeline atom.models Out[5]:
                                                                                                                                                                                                                                                                                                                                                                  ['LR', 'Tree', 'LGB', 'Vote']
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # The plot_pipeline method helps us visualize the ensemble\natom.plot_pipeline()\n
                                                                                                                                                                                                                                                                                                                                                                  # The plot_pipeline method helps us visualize the ensemble atom.plot_pipeline() In\u00a0[7]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # The Vote model averages the scores of the models it contains\natom.vote\n
                                                                                                                                                                                                                                                                                                                                                                  # The Vote model averages the scores of the models it contains atom.vote Out[7]:
                                                                                                                                                                                                                                                                                                                                                                  Voting()
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[8]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # We can use it like any other model to make predictions or plots\natom.vote.predict_proba(range(10))\n
                                                                                                                                                                                                                                                                                                                                                                  # We can use it like any other model to make predictions or plots atom.vote.predict_proba(range(10)) Out[8]: 0 1 0 0.961516 0.038484 1 0.999968 0.000032 2 0.998743 0.001257 3 0.968071 0.031929 4 0.000014 0.999986 5 0.999991 0.000009 6 0.000019 0.999981 7 0.000015 0.999985 8 0.000026 0.999974 9 0.002627 0.997373 In\u00a0[9]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.vote.plot_threshold(metric=[\"auc\", \"recall\", \"accuracy\"])\n
                                                                                                                                                                                                                                                                                                                                                                  atom.vote.plot_threshold(metric=[\"auc\", \"recall\", \"accuracy\"]) In\u00a0[10]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_results(legend=None)\n
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_results(legend=None) In\u00a0[11]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.delete(\"vote\")\n
                                                                                                                                                                                                                                                                                                                                                                  atom.delete(\"vote\")
                                                                                                                                                                                                                                                                                                                                                                  Deleting 1 models...\n --> Model Vote successfully deleted.\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[12]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Just like Voting, we can create a Stacking model\natom.stacking(final_estimator=\"LDA\")\n
                                                                                                                                                                                                                                                                                                                                                                  # Just like Voting, we can create a Stacking model atom.stacking(final_estimator=\"LDA\")
                                                                                                                                                                                                                                                                                                                                                                  Results for Stacking:\nFit ---------------------------------------------\nTrain evaluation --> accuracy: 0.9934\nTest evaluation --> accuracy: 0.9823\nTime elapsed: 0.728s\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[13]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # The final estimator uses the predictions of the underlying models\natom.stack.head()\n
                                                                                                                                                                                                                                                                                                                                                                  # The final estimator uses the predictions of the underlying models atom.stack.head() Out[13]: mean radius mean texture mean perimeter mean area mean smoothness mean compactness mean concavity mean concave points mean symmetry mean fractal dimension ... worst texture worst perimeter worst area worst smoothness worst compactness worst concavity worst concave points worst symmetry worst fractal dimension target 0 13.48 20.82 88.40 559.2 0.10160 0.12550 0.10630 0.05439 0.1720 0.06419 ... 26.02 107.30 740.4 0.1610 0.42250 0.5030 0.22580 0.2807 0.10710 0 1 18.31 20.58 120.80 1052.0 0.10680 0.12480 0.15690 0.09451 0.1860 0.05941 ... 26.20 142.20 1493.0 0.1492 0.25360 0.3759 0.15100 0.3074 0.07863 0 2 17.93 24.48 115.20 998.9 0.08855 0.07027 0.05699 0.04744 0.1538 0.05510 ... 34.69 135.10 1320.0 0.1315 0.18060 0.2080 0.11360 0.2504 0.07948 0 3 15.13 29.81 96.71 719.5 0.08320 0.04605 0.04686 0.02739 0.1852 0.05294 ... 36.91 110.10 931.4 0.1148 0.09866 0.1547 0.06575 0.3233 0.06165 0 4 8.95 15.76 58.74 245.2 0.09462 0.12430 0.09263 0.02308 0.1305 0.07163 ... 17.07 63.34 270.0 0.1179 0.18790 0.1544 0.03846 0.1652 0.07722 1

                                                                                                                                                                                                                                                                                                                                                                  5 rows \u00d7 31 columns

                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[14]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Again, the model can be used for predictions or plots\natom.stack.predict(X)\n
                                                                                                                                                                                                                                                                                                                                                                  # Again, the model can be used for predictions or plots atom.stack.predict(X) Out[14]:
                                                                                                                                                                                                                                                                                                                                                                  0      0\n1      0\n2      0\n3      0\n4      1\n      ..\n564    1\n565    0\n566    0\n567    0\n568    1\nName: target, Length: 569, dtype: int64
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[15]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.stack.plot_shap_beeswarm(show=10)\n
                                                                                                                                                                                                                                                                                                                                                                  atom.stack.plot_shap_beeswarm(show=10)
                                                                                                                                                                                                                                                                                                                                                                  PermutationExplainer explainer: 114it [00:48,  2.01it/s]                                                                                                                                                                                                                                                             \n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/ensembles/#example-ensembles", "title": "Example: Ensembles\u00b6", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This example shows how to use atom's ensemble techniques to improve predictions on a dataset combining several models.

                                                                                                                                                                                                                                                                                                                                                                  Import the breast cancer dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/ensembles/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/ensembles/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/ensembles/#voting", "title": "Voting\u00b6", "text": ""}, {"location": "examples/ensembles/#stacking", "title": "Stacking\u00b6", "text": ""}, {"location": "examples/feature_engineering/", "title": "Feature engineering", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                                                                                                                                                                                                  # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n
                                                                                                                                                                                                                                                                                                                                                                  # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\") # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0

                                                                                                                                                                                                                                                                                                                                                                  5 rows \u00d7 22 columns

                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[3]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Initialize atom and apply data cleaning\natom = ATOMClassifier(X, n_rows=1e4, test_size=0.2, verbose=0)\natom.impute(strat_num=\"knn\", strat_cat=\"remove\", max_nan_rows=0.8)\natom.encode(max_onehot=10, infrequent_to_value=0.04)\n
                                                                                                                                                                                                                                                                                                                                                                  # Initialize atom and apply data cleaning atom = ATOMClassifier(X, n_rows=1e4, test_size=0.2, verbose=0) atom.impute(strat_num=\"knn\", strat_cat=\"remove\", max_nan_rows=0.8) atom.encode(max_onehot=10, infrequent_to_value=0.04) In\u00a0[4]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.verbose = 2  # Increase verbosity to see the output\n\n# Let's see how a LightGBM model performs\natom.run('LGB', metric='auc')\n
                                                                                                                                                                                                                                                                                                                                                                  atom.verbose = 2 # Increase verbosity to see the output # Let's see how a LightGBM model performs atom.run('LGB', metric='auc')
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: LGB\nMetric: auc\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --> auc: 0.9817\nTest evaluation --> auc: 0.8584\nTime elapsed: 0.831s\n-------------------------------------------------\nTime: 0.831s\n\n\nFinal results ==================== >>\nTotal time: 0.963s\n-------------------------------------\nLightGBM --> auc: 0.8584\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Since we are going to compare different datasets,\n# we need to create separate branches\natom.branch = \"dfs\"\n
                                                                                                                                                                                                                                                                                                                                                                  # Since we are going to compare different datasets, # we need to create separate branches atom.branch = \"dfs\"
                                                                                                                                                                                                                                                                                                                                                                  Successfully created new branch: dfs.\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Create 50 new features using dfs\natom.feature_generation(\"dfs\", n_features=50, operators=[\"add\", \"sub\", \"log\"])\n
                                                                                                                                                                                                                                                                                                                                                                  # Create 50 new features using dfs atom.feature_generation(\"dfs\", n_features=50, operators=[\"add\", \"sub\", \"log\"])
                                                                                                                                                                                                                                                                                                                                                                  Fitting FeatureGenerator...\nGenerating new features...\n --> 50 new features were added.\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[7]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # The warnings warn us that some operators created missing values!\n# We can see the columns with missing values using the nans attribute\natom.nans\n
                                                                                                                                                                                                                                                                                                                                                                  # The warnings warn us that some operators created missing values! # We can see the columns with missing values using the nans attribute atom.nans Out[7]:
                                                                                                                                                                                                                                                                                                                                                                  Location                       0\nMinTemp                        0\nMaxTemp                        0\nRainfall                       0\nEvaporation                    0\n                              ..\nTemp9am - WindDir3pm           0\nWindDir9am + WindGustSpeed     0\nWindDir9am + WindSpeed3pm      0\nWindGustDir + WindSpeed9am     0\nWindSpeed3pm - WindSpeed9am    0\nLength: 73, dtype: int64
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[8]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Turn off warnings in the future\natom.warnings = False\n\n# Impute the data again to get rid of the missing values\natom.impute(strat_num=\"knn\", strat_cat=\"remove\", max_nan_rows=0.8)\n
                                                                                                                                                                                                                                                                                                                                                                  # Turn off warnings in the future atom.warnings = False # Impute the data again to get rid of the missing values atom.impute(strat_num=\"knn\", strat_cat=\"remove\", max_nan_rows=0.8)
                                                                                                                                                                                                                                                                                                                                                                  Fitting Imputer...\nImputing missing values...\n --> Imputing 12 missing values using the KNN imputer in feature NATURAL_LOGARITHM(Temp3pm).\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[9]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # 50 new features may be to much...\n# Let's check for multicollinearity and use rfecv to reduce the number\natom.feature_selection(\n    strategy=\"rfecv\",\n    solver=\"LGB\",\n    n_features=30,\n    scoring=\"auc\",\n    max_correlation=0.98,\n)\n
                                                                                                                                                                                                                                                                                                                                                                  # 50 new features may be to much... # Let's check for multicollinearity and use rfecv to reduce the number atom.feature_selection( strategy=\"rfecv\", solver=\"LGB\", n_features=30, scoring=\"auc\", max_correlation=0.98, )
                                                                                                                                                                                                                                                                                                                                                                  Fitting FeatureSelector...\nPerforming feature selection ...\n --> Feature MinTemp was removed due to collinearity with another feature.\n --> Feature MinTemp + RainToday_No was removed due to collinearity with another feature.\n --> Feature MaxTemp was removed due to collinearity with another feature.\n --> Feature MaxTemp + WindDir3pm was removed due to collinearity with another feature.\n --> Feature MaxTemp + WindGustDir was removed due to collinearity with another feature.\n --> Feature Rainfall was removed due to collinearity with another feature.\n --> Feature Rainfall + RainToday_rare was removed due to collinearity with another feature.\n --> Feature Rainfall + WindDir3pm was removed due to collinearity with another feature.\n --> Feature Sunshine was removed due to collinearity with another feature.\n --> Feature Sunshine - WindDir3pm was removed due to collinearity with another feature.\n --> Feature WindGustSpeed was removed due to collinearity with another feature.\n --> Feature WindSpeed9am was removed due to collinearity with another feature.\n --> Feature WindSpeed3pm was removed due to collinearity with another feature.\n --> Feature Humidity9am was removed due to collinearity with another feature.\n --> Feature Humidity3pm was removed due to collinearity with another feature.\n --> Feature NATURAL_LOGARITHM(Pressure3pm) was removed due to collinearity with another feature.\n --> Feature Pressure3pm - RainToday_Yes was removed due to collinearity with another feature.\n --> Feature Cloud9am + RainToday_No was removed due to collinearity with another feature.\n --> Feature Cloud3pm was removed due to 
collinearity with another feature.\n --> Feature Cloud3pm + Location was removed due to collinearity with another feature.\n --> Feature Temp9am - WindDir3pm was removed due to collinearity with another feature.\n --> Feature Temp3pm was removed due to collinearity with another feature.\n --> Feature Temp3pm - WindDir9am was removed due to collinearity with another feature.\n --> Feature RainToday_rare was removed due to collinearity with another feature.\n --> rfecv selected 38 features from the dataset.\n   --> Dropping feature Location (rank 12).\n   --> Dropping feature Cloud9am (rank 2).\n   --> Dropping feature RainToday_No (rank 10).\n   --> Dropping feature RainToday_Yes (rank 11).\n   --> Dropping feature Location + RainToday_rare (rank 9).\n   --> Dropping feature Location - Pressure9am (rank 4).\n   --> Dropping feature Location - Temp9am (rank 7).\n   --> Dropping feature Location - WindGustDir (rank 8).\n   --> Dropping feature RainToday_No - WindSpeed3pm (rank 3).\n   --> Dropping feature RainToday_rare + Temp3pm (rank 5).\n   --> Dropping feature Rainfall + RainToday_Yes (rank 6).\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[10]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # The collinear attribute shows what features were removed due to multicollinearity\natom.collinear_\n
                                                                                                                                                                                                                                                                                                                                                                  # The collinear attribute shows what features were removed due to multicollinearity atom.collinear_ Out[10]: drop corr_feature corr_value 0 MinTemp MinTemp + RainToday_No, MinTemp + RainToday_Yes 0.9978, 0.9979 1 MinTemp + RainToday_No MinTemp, MinTemp + RainToday_Yes 0.9978, 0.9914 2 MaxTemp MaxTemp + WindDir3pm, MaxTemp + WindDir9am, Ma... 1.0, 1.0, 1.0 3 MaxTemp + WindDir3pm MaxTemp, MaxTemp + WindDir9am, MaxTemp + WindG... 1.0, 1.0, 1.0 4 MaxTemp + WindGustDir MaxTemp, MaxTemp + WindDir3pm, MaxTemp + WindD... 1.0, 1.0, 1.0 5 Rainfall Rainfall + RainToday_Yes, Rainfall + RainToday... 0.999, 0.9999, 1.0 6 Rainfall + RainToday_rare Rainfall, Rainfall + RainToday_Yes, Rainfall +... 0.9999, 0.9989, 0.9999 7 Rainfall + WindDir3pm Rainfall, Rainfall + RainToday_Yes, Rainfall +... 
1.0, 0.999, 0.9999 8 Sunshine RainToday_rare + Sunshine, Sunshine - WindDir3pm 0.9994, 0.9998 9 Sunshine - WindDir3pm Sunshine, RainToday_rare + Sunshine 0.9998, 0.9993 10 WindGustSpeed WindDir9am + WindGustSpeed 1.0 11 WindSpeed9am WindGustDir + WindSpeed9am 1.0 12 WindSpeed3pm WindDir9am + WindSpeed3pm 1.0 13 Humidity9am Humidity9am + WindGustDir 1.0 14 Humidity3pm Humidity3pm - Sunshine 0.9937 15 NATURAL_LOGARITHM(Pressure3pm) Pressure3pm, Pressure3pm - RainToday_Yes 1.0, 0.9981 16 Pressure3pm - RainToday_Yes Pressure3pm, NATURAL_LOGARITHM(Pressure3pm) 0.9981, 0.9981 17 Cloud9am + RainToday_No Cloud9am 0.9828 18 Cloud3pm Cloud3pm + Location, Cloud3pm + RainToday_rare 1.0, 0.9991 19 Cloud3pm + Location Cloud3pm, Cloud3pm + RainToday_rare 1.0, 0.9991 20 Temp9am - WindDir3pm Temp9am 1.0 21 Temp3pm RainToday_rare + Temp3pm, Temp3pm - WindDir9am 0.9999, 1.0 22 Temp3pm - WindDir9am Temp3pm, RainToday_rare + Temp3pm 1.0, 0.9999 23 RainToday_rare Location + RainToday_rare 1.0 In\u00a0[11]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # After applying rfecv, we can plot the score per number of features\natom.plot_rfecv()\n
                                                                                                                                                                                                                                                                                                                                                                  # After applying rfecv, we can plot the score per number of features atom.plot_rfecv() In\u00a0[12]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Let's see how the model performs now\n# Add a tag to the model's acronym to not overwrite previous LGB\natom.run(\"LGB_dfs\", errors=\"raise\")\n
                                                                                                                                                                                                                                                                                                                                                                  # Let's see how the model performs now # Add a tag to the model's acronym to not overwrite previous LGB atom.run(\"LGB_dfs\", errors=\"raise\")
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: LGB_dfs\nMetric: auc\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --> auc: 0.9893\nTest evaluation --> auc: 0.8572\nTime elapsed: 1.045s\n-------------------------------------------------\nTime: 1.045s\n\n\nFinal results ==================== >>\nTotal time: 1.186s\n-------------------------------------\nLightGBM --> auc: 0.8572\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[13]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Create another branch for the genetic features\n# Split form master to avoid the dfs features\natom.branch = \"gfg_from_main\"\n
                                                                                                                                                                                                                                                                                                                                                                  # Create another branch for the genetic features # Split form master to avoid the dfs features atom.branch = \"gfg_from_main\"
                                                                                                                                                                                                                                                                                                                                                                  Successfully created new branch: gfg.\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[14]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Create new features using Genetic Programming\natom.feature_generation(strategy='gfg', n_features=20)\n
                                                                                                                                                                                                                                                                                                                                                                  # Create new features using Genetic Programming atom.feature_generation(strategy='gfg', n_features=20)
                                                                                                                                                                                                                                                                                                                                                                  Fitting FeatureGenerator...\n    |   Population Average    |             Best Individual              |\n---- ------------------------- ------------------------------------------ ----------\n Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left\n   0     3.08         0.137852        3         0.505879              N/A     18.62s\n   1     3.30         0.332951        6         0.506041              N/A     19.23s\n   2     3.92         0.429317        7         0.525775              N/A     18.31s\n   3     4.64         0.459817        9         0.532823              N/A     16.25s\n   4     6.59         0.475058       11         0.540078              N/A     15.51s\n   5     8.04         0.498345       13          0.54114              N/A     14.56s\n   6     9.80         0.509423       13         0.543911              N/A     13.87s\n   7    10.86         0.513225       15         0.551242              N/A     13.28s\n   8    11.54         0.513973       15         0.554127              N/A     11.99s\n   9    12.21         0.516725       19         0.554172              N/A     11.44s\n  10    13.09         0.520543       17         0.556923              N/A     10.19s\n  11    13.24         0.519283       17         0.556923              N/A      9.07s\n  12    12.74          0.51949       21         0.558114              N/A      7.95s\n  13    13.88         0.521709       21         0.558114              N/A      6.68s\n  14    15.99         0.523381       19         0.558673              N/A      6.12s\n  15    16.74         0.523708       19         0.558673              N/A      
7.97s\n  16    16.84         0.524509       19         0.560449              N/A      6.02s\n  17    16.79         0.525061       19         0.560449              N/A      2.26s\n  18    16.77         0.523639       21         0.561281              N/A      1.11s\n  19    17.03         0.524261       23         0.561813              N/A      0.00s\nGenerating new features...\n --> 20 new features were added.\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[16]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # We can see the feature's fitness and description through the genetic_features attribute\natom.genetic_features_\n
                                                                                                                                                                                                                                                                                                                                                                  # We can see the feature's fitness and description through the genetic_features attribute atom.genetic_features_ Out[16]: name description fitness 0 x23 mul(add(WindGustSpeed, Humidity3pm), mul(add(C... 0.541449 1 x24 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.541449 2 x25 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.541449 3 x26 mul(add(WindGustSpeed, Humidity3pm), mul(add(C... 0.541449 4 x27 mul(add(WindGustSpeed, Humidity3pm), mul(add(C... 0.541449 5 x28 mul(add(Cloud3pm, add(Cloud3pm, mul(add(WindGu... 0.541322 6 x29 mul(add(Cloud3pm, mul(Humidity3pm, WindDir3pm)... 0.541229 7 x30 mul(add(Cloud3pm, mul(Humidity3pm, WindDir3pm)... 0.541229 8 x31 mul(add(Cloud3pm, mul(Humidity3pm, WindDir3pm)... 0.540696 9 x32 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.540674 10 x33 mul(add(WindGustSpeed, Humidity3pm), mul(add(C... 0.540674 11 x34 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.540674 12 x35 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.540281 13 x36 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.540281 14 x37 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.539923 15 x38 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.539923 16 x39 mul(add(WindGustSpeed, add(Humidity3pm, Rainfa... 0.539923 17 x40 mul(add(WindGustSpeed, Humidity3pm), mul(add(C... 0.539923 18 x41 mul(mul(add(Cloud3pm, add(Cloud3pm, mul(Humidi... 0.539923 19 x42 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.539909 In\u00a0[17]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Fit the model again\natom.run(\"LGB_gfg\", metric=\"auc\")\n
                                                                                                                                                                                                                                                                                                                                                                  # Fit the model again atom.run(\"LGB_gfg\", metric=\"auc\")
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: LGB_gfg\nMetric: auc\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --> auc: 0.9857\nTest evaluation --> auc: 0.8558\nTime elapsed: 1.044s\n-------------------------------------------------\nTime: 1.044s\n\n\nFinal results ==================== >>\nTotal time: 1.227s\n-------------------------------------\nLightGBM --> auc: 0.8558\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[18]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Visualize the whole pipeline\natom.plot_pipeline()\n
                                                                                                                                                                                                                                                                                                                                                                  # Visualize the whole pipeline atom.plot_pipeline() In\u00a0[19]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Use atom's plots to compare the three models\natom.plot_roc(rows=\"test+train\")\n
                                                                                                                                                                                                                                                                                                                                                                  # Use atom's plots to compare the three models atom.plot_roc(rows=\"test+train\") In\u00a0[23]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # To compare other plots it might be useful to use a canvas\nwith atom.canvas(1, 2, figsize=(1800, 800)):\n    atom.lgb_dfs.plot_roc(rows=\"test+train\")\n    atom.lgb_dfs.plot_feature_importance(show=10, title=\"LGB + dfs\")\n
                                                                                                                                                                                                                                                                                                                                                                  # To compare other plots it might be useful to use a canvas with atom.canvas(1, 2, figsize=(1800, 800)): atom.lgb_dfs.plot_roc(rows=\"test+train\") atom.lgb_dfs.plot_feature_importance(show=10, title=\"LGB + dfs\") In\u00a0[21]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # We can check the feature importance with other plots as well\natom.plot_permutation_importance(models=[\"LGB_dfs\", \"LGB_gfg\"], show=12)\n
                                                                                                                                                                                                                                                                                                                                                                  # We can check the feature importance with other plots as well atom.plot_permutation_importance(models=[\"LGB_dfs\", \"LGB_gfg\"], show=12) In\u00a0[24]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.LGB_gfg.plot_shap_decision(rows=(0, 10), show=15)\n
                                                                                                                                                                                                                                                                                                                                                                  atom.LGB_gfg.plot_shap_decision(rows=(0, 10), show=15)"}, {"location": "examples/feature_engineering/#example-feature-engineering", "title": "Example: Feature engineering\u00b6", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This example shows how to use automated feature generation to improve a model's performance.

                                                                                                                                                                                                                                                                                                                                                                  The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target RainTomorrow.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/feature_engineering/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/feature_engineering/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/feature_engineering/#deep-feature-synthesis", "title": "Deep Feature Synthesis\u00b6", "text": ""}, {"location": "examples/feature_engineering/#genetic-feature-generation", "title": "Genetic Feature Generation\u00b6", "text": ""}, {"location": "examples/feature_engineering/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/getting_started/", "title": "Getting started", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  import pandas as pd\nfrom atom import ATOMClassifier\n\n# Load the Australian Weather dataset\nX = pd.read_csv(\"https://raw.githubusercontent.com/tvdboom/ATOM/master/examples/datasets/weatherAUS.csv\")\n
                                                                                                                                                                                                                                                                                                                                                                  import pandas as pd from atom import ATOMClassifier # Load the Australian Weather dataset X = pd.read_csv(\"https://raw.githubusercontent.com/tvdboom/ATOM/master/examples/datasets/weatherAUS.csv\") In\u00a0[2]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMClassifier(X, y=\"RainTomorrow\", n_rows=1000, verbose=2)\n
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMClassifier(X, y=\"RainTomorrow\", n_rows=1000, verbose=2)
                                                                                                                                                                                                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (1000, 22)\nTrain set size: 800\nTest set size: 200\n-------------------------------------\nMemory: 176.13 kB\nScaled: False\nMissing values: 2260 (10.3%)\nCategorical features: 5 (23.8%)\n\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[3]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.impute(strat_num=\"median\", strat_cat=\"most_frequent\")  \natom.encode(strategy=\"Target\", max_onehot=8)\n
                                                                                                                                                                                                                                                                                                                                                                  atom.impute(strat_num=\"median\", strat_cat=\"most_frequent\") atom.encode(strategy=\"Target\", max_onehot=8)
                                                                                                                                                                                                                                                                                                                                                                  Fitting Imputer...\nImputing missing values...\n --> Imputing 8 missing values with median (11.6) in feature MinTemp.\n --> Imputing 2 missing values with median (22.3) in feature MaxTemp.\n --> Imputing 12 missing values with median (0.0) in feature Rainfall.\n --> Imputing 425 missing values with median (4.8) in feature Evaporation.\n --> Imputing 480 missing values with median (8.55) in feature Sunshine.\n --> Imputing 59 missing values with most_frequent (N) in feature WindGustDir.\n --> Imputing 59 missing values with median (37.0) in feature WindGustSpeed.\n --> Imputing 90 missing values with most_frequent (N) in feature WindDir9am.\n --> Imputing 28 missing values with most_frequent (SW) in feature WindDir3pm.\n --> Imputing 10 missing values with median (13.0) in feature WindSpeed9am.\n --> Imputing 19 missing values with median (17.0) in feature WindSpeed3pm.\n --> Imputing 17 missing values with median (70.0) in feature Humidity9am.\n --> Imputing 31 missing values with median (51.0) in feature Humidity3pm.\n --> Imputing 89 missing values with median (1017.8) in feature Pressure9am.\n --> Imputing 87 missing values with median (1015.2) in feature Pressure3pm.\n --> Imputing 383 missing values with median (5.0) in feature Cloud9am.\n --> Imputing 412 missing values with median (5.0) in feature Cloud3pm.\n --> Imputing 11 missing values with median (16.5) in feature Temp9am.\n --> Imputing 26 missing values with median (20.7) in feature Temp3pm.\n --> Imputing 12 missing values with most_frequent (No) in feature RainToday.\nFitting Encoder...\nEncoding categorical columns...\n --> Target-encoding feature 
Location. Contains 49 classes.\n --> Target-encoding feature WindGustDir. Contains 16 classes.\n --> Target-encoding feature WindDir9am. Contains 16 classes.\n --> Target-encoding feature WindDir3pm. Contains 16 classes.\n --> Ordinal-encoding feature RainToday. Contains 2 classes.\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.run(models=[\"LDA\", \"AdaB\"], metric=\"auc\", n_trials=10)\n
                                                                                                                                                                                                                                                                                                                                                                  atom.run(models=[\"LDA\", \"AdaB\"], metric=\"auc\", n_trials=10)
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: LDA, AdaB\nMetric: auc\n\n\nRunning hyperparameter tuning for LinearDiscriminantAnalysis...\n| trial |  solver | shrinkage |     auc | best_auc | time_trial | time_ht |    state |\n| ----- | ------- | --------- | ------- | -------- | ---------- | ------- | -------- |\n| 0     |   eigen |       0.9 |  0.8807 |   0.8807 |     0.162s |  0.162s | COMPLETE |\n| 1     |     svd |       nan |  0.8445 |   0.8807 |     0.147s |  0.309s | COMPLETE |\n| 2     |     svd |       nan |  0.8445 |   0.8807 |     0.001s |  0.310s | COMPLETE |\n| 3     |     svd |       nan |  0.8445 |   0.8807 |     0.001s |  0.311s | COMPLETE |\n| 4     |     svd |       nan |  0.8445 |   0.8807 |     0.001s |  0.312s | COMPLETE |\n| 5     |   eigen |       0.9 |  0.8807 |   0.8807 |     0.000s |  0.312s | COMPLETE |\n| 6     |     svd |       nan |  0.8445 |   0.8807 |     0.000s |  0.312s | COMPLETE |\n| 7     |     svd |       nan |  0.8445 |   0.8807 |     0.001s |  0.313s | COMPLETE |\n| 8     |   eigen |       0.5 |  0.8417 |   0.8807 |     0.143s |  0.456s | COMPLETE |\n| 9     |     svd |       nan |  0.8445 |   0.8807 |     0.001s |  0.457s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 0\nBest parameters:\n --> solver: eigen\n --> shrinkage: 0.9\nBest evaluation --> auc: 0.8807\nTime elapsed: 0.457s\nFit ---------------------------------------------\nTrain evaluation --> auc: 0.8381\nTest evaluation --> auc: 0.8037\nTime elapsed: 0.025s\n-------------------------------------------------\nTime: 0.482s\n\n\nRunning hyperparameter tuning for AdaBoost...\n| trial 
| n_estimators | learning_rate | algorithm |     auc | best_auc | time_trial | time_ht |    state |\n| ----- | ------------ | ------------- | --------- | ------- | -------- | ---------- | ------- | -------- |\n| 0     |           90 |        0.4088 |   SAMME.R |  0.8002 |   0.8002 |     0.331s |  0.331s | COMPLETE |\n| 1     |          190 |        0.1019 |   SAMME.R |  0.8294 |   0.8294 |     0.540s |  0.871s | COMPLETE |\n| 2     |          260 |         0.243 |   SAMME.R |   0.754 |   0.8294 |     0.645s |  1.515s | COMPLETE |\n| 3     |          490 |         0.041 |   SAMME.R |  0.7953 |   0.8294 |     1.105s |  2.620s | COMPLETE |\n| 4     |          210 |        0.1604 |     SAMME |  0.7969 |   0.8294 |     0.527s |  3.148s | COMPLETE |\n| 5     |          310 |        0.1504 |     SAMME |  0.7988 |   0.8294 |     0.696s |  3.843s | COMPLETE |\n| 6     |          380 |         2.445 |     SAMME |  0.5978 |   0.8294 |     0.830s |  4.674s | COMPLETE |\n| 7     |          100 |        0.9151 |     SAMME |  0.8372 |   0.8372 |     0.328s |  5.002s | COMPLETE |\n| 8     |          350 |        8.9334 |     SAMME |  0.6751 |   0.8372 |     0.786s |  5.787s | COMPLETE |\n| 9     |          450 |        0.1974 |     SAMME |    0.82 |   0.8372 |     0.969s |  6.757s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 7\nBest parameters:\n --> n_estimators: 100\n --> learning_rate: 0.9151\n --> algorithm: SAMME\nBest evaluation --> auc: 0.8372\nTime elapsed: 6.757s\nFit ---------------------------------------------\nTrain evaluation --> auc: 0.9133\nTest evaluation --> auc: 0.8353\nTime elapsed: 0.232s\n-------------------------------------------------\nTime: 6.989s\n\n\nFinal results ==================== >>\nTotal time: 9.134s\n-------------------------------------\nLinearDiscriminantAnalysis --> auc: 0.8037\nAdaBoost                   --> auc: 0.8353 !\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.evaluate()\n
                                                                                                                                                                                                                                                                                                                                                                  atom.evaluate() Out[5]: accuracy ap ba f1 jaccard mcc precision recall auc LDA 0.785 0.5888 0.7533 0.5825 0.4110 0.4542 0.5000 0.6977 0.8037 AdaB 0.820 0.5801 0.7165 0.5610 0.3898 0.4490 0.5897 0.5349 0.8353"}, {"location": "examples/getting_started/#example-getting-started", "title": "Example: Getting started\u00b6", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This example shows how to get started with the atom-ml library.

                                                                                                                                                                                                                                                                                                                                                                  The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target RainTomorrow.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/holdout_set/", "title": "Holdout set", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                                                                                                                                                                                                  # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n
                                                                                                                                                                                                                                                                                                                                                                  # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\") # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0

                                                                                                                                                                                                                                                                                                                                                                  5 rows \u00d7 22 columns

                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[3]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Initialize atom specifying a fraction of the dataset for holdout\natom = ATOMClassifier(X, n_rows=0.5, holdout_size=0.2, verbose=2)\n
                                                                                                                                                                                                                                                                                                                                                                  # Initialize atom specifying a fraction of the dataset for holdout atom = ATOMClassifier(X, n_rows=0.5, holdout_size=0.2, verbose=2)
                                                                                                                                                                                                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (56877, 22)\nTrain set size: 42658\nTest set size: 14219\nHoldout set size: 14219\n-------------------------------------\nMemory: 10.01 MB\nScaled: False\nMissing values: 126822 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 15 (0.0%)\n\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # The test and holdout fractions are split after subsampling the dataset\n# Also note that the holdout data set is not a part of atom's dataset\nprint(\"Length loaded data:\", len(X))\nprint(\"Length dataset + holdout:\", len(atom.dataset) + len(atom.holdout))\n
                                                                                                                                                                                                                                                                                                                                                                  # The test and holdout fractions are split after subsampling the dataset # Also note that the holdout data set is not a part of atom's dataset print(\"Length loaded data:\", len(X)) print(\"Length dataset + holdout:\", len(atom.dataset) + len(atom.holdout))
                                                                                                                                                                                                                                                                                                                                                                  Length loaded data: 142193\nLength dataset + holdout: 71096\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.impute()\natom.encode()\n
                                                                                                                                                                                                                                                                                                                                                                  atom.impute() atom.encode()
                                                                                                                                                                                                                                                                                                                                                                  Fitting Imputer...\nImputing missing values...\n --> Dropping 258 samples due to missing values in feature MinTemp.\n --> Dropping 127 samples due to missing values in feature MaxTemp.\n --> Dropping 553 samples due to missing values in feature Rainfall.\n --> Dropping 24308 samples due to missing values in feature Evaporation.\n --> Dropping 27187 samples due to missing values in feature Sunshine.\n --> Dropping 3739 samples due to missing values in feature WindGustDir.\n --> Dropping 3712 samples due to missing values in feature WindGustSpeed.\n --> Dropping 3995 samples due to missing values in feature WindDir9am.\n --> Dropping 1508 samples due to missing values in feature WindDir3pm.\n --> Dropping 539 samples due to missing values in feature WindSpeed9am.\n --> Dropping 1077 samples due to missing values in feature WindSpeed3pm.\n --> Dropping 706 samples due to missing values in feature Humidity9am.\n --> Dropping 1447 samples due to missing values in feature Humidity3pm.\n --> Dropping 5610 samples due to missing values in feature Pressure9am.\n --> Dropping 5591 samples due to missing values in feature Pressure3pm.\n --> Dropping 21520 samples due to missing values in feature Cloud9am.\n --> Dropping 22921 samples due to missing values in feature Cloud3pm.\n --> Dropping 365 samples due to missing values in feature Temp9am.\n --> Dropping 1106 samples due to missing values in feature Temp3pm.\n --> Dropping 553 samples due to missing values in feature RainToday.\nFitting Encoder...\nEncoding categorical columns...\n --> Target-encoding feature Location. 
Contains 26 classes.\n --> Target-encoding feature WindGustDir. Contains 16 classes.\n --> Target-encoding feature WindDir9am. Contains 16 classes.\n --> Target-encoding feature WindDir3pm. Contains 16 classes.\n --> Ordinal-encoding feature RainToday. Contains 2 classes.\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Unlike train and test, the holdout data set is not transformed until used for predictions\natom.holdout\n
                                                                                                                                                                                                                                                                                                                                                                  # Unlike train and test, the holdout data set is not transformed until used for predictions atom.holdout Out[6]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 22540 NorahHead 15.8 23.7 0.4 NaN NaN SSW 50.0 NW NaN ... 79.0 80.0 1012.4 1009.6 NaN NaN 18.4 18.9 No 0 22541 Brisbane 13.0 24.1 0.0 3.2 3.6 W 24.0 SW WSW ... 53.0 27.0 1019.9 1015.9 7.0 8.0 17.3 22.1 No 0 22542 MountGambier 14.7 36.2 0.0 7.2 12.5 S 33.0 N SSW ... 52.0 27.0 1018.8 1017.4 7.0 2.0 25.2 35.4 No 0 22543 Launceston 12.3 21.4 0.0 NaN NaN NNW 52.0 NNW NNW ... 62.0 60.0 NaN NaN 5.0 8.0 16.2 20.4 No 0 22544 MountGinini 3.2 10.0 0.0 NaN NaN WSW 52.0 WSW WSW ... 97.0 95.0 NaN NaN NaN NaN 6.5 8.4 No 0 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 36754 MountGinini 1.6 4.4 0.0 NaN NaN E 52.0 E E ... 100.0 100.0 NaN NaN NaN NaN 2.7 2.6 No 1 36755 WaggaWagga 9.9 21.8 0.0 4.6 5.7 WSW 35.0 S SW ... 57.0 36.0 1015.5 1013.7 7.0 7.0 17.0 21.3 No 0 36756 Walpole 8.8 16.3 0.8 NaN NaN NNW 37.0 NNE N ... 84.0 79.0 1018.4 1013.5 NaN NaN 11.0 14.6 No 1 36757 Dartmoor 8.7 15.5 2.0 1.4 5.4 S 30.0 WSW SSW ... 100.0 94.0 1018.6 1020.0 NaN NaN 12.9 12.8 Yes 0 36758 SydneyAirport 16.8 22.6 8.4 5.0 3.8 S 57.0 WNW S ... 79.0 75.0 1013.2 1013.7 8.0 6.0 17.1 18.8 Yes 0

                                                                                                                                                                                                                                                                                                                                                                  14219 rows \u00d7 22 columns

                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[7]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.run(models=[\"GNB\", \"LR\", \"RF\"])\n
                                                                                                                                                                                                                                                                                                                                                                  atom.run(models=[\"GNB\", \"LR\", \"RF\"])
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: GNB, LR, RF\nMetric: f1\n\n\nResults for GaussianNB:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.604\nTest evaluation --> f1: 0.6063\nTime elapsed: 0.209s\n-------------------------------------------------\nTime: 0.209s\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.6188\nTest evaluation --> f1: 0.6162\nTime elapsed: 0.323s\n-------------------------------------------------\nTime: 0.323s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> f1: 1.0\nTest evaluation --> f1: 0.6084\nTime elapsed: 4.533s\n-------------------------------------------------\nTime: 4.533s\n\n\nFinal results ==================== >>\nTotal time: 5.734s\n-------------------------------------\nGaussianNB         --> f1: 0.6063\nLogisticRegression --> f1: 0.6162 !\nRandomForest       --> f1: 0.6084 ~\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[8]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_prc()\n
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_prc() In\u00a0[9]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Based on the results on the test set, we select the best model for further tuning\natom.run(\"lr_tuned\", n_trials=10)\n
                                                                                                                                                                                                                                                                                                                                                                  # Based on the results on the test set, we select the best model for further tuning atom.run(\"lr_tuned\", n_trials=10)
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: LR_tuned\nMetric: f1\n\n\nRunning hyperparameter tuning for LogisticRegression...\n| trial | penalty |       C |  solver | max_iter | l1_ratio |      f1 | best_f1 | time_trial | time_ht |    state |\n| ----- | ------- | ------- | ------- | -------- | -------- | ------- | ------- | ---------- | ------- | -------- |\n| 0     |    None |  0.1893 |     sag |      540 |      0.4 |  0.6096 |  0.6096 |     0.797s |  0.797s | COMPLETE |\n| 1     |      l2 |  0.6275 | newto.. |      150 |      0.7 |  0.6101 |  0.6101 |     0.637s |  1.433s | COMPLETE |\n| 2     |      l1 |  0.7457 | libli.. |      740 |      0.7 |  0.6114 |  0.6114 |     0.815s |  2.248s | COMPLETE |\n| 3     |      l2 |  0.0759 | newto.. |      290 |      0.4 |  0.6204 |  0.6204 |     0.634s |  2.882s | COMPLETE |\n| 4     |      l2 |  0.2122 | newto.. 
|      730 |      0.9 |  0.6273 |  0.6273 |     0.635s |  3.516s | COMPLETE |\n| 5     |      l2 |  0.0017 |   lbfgs |      260 |      1.0 |   0.589 |  0.6273 |     0.581s |  4.097s | COMPLETE |\n| 6     |      l2 |  0.0137 |     sag |      130 |      0.4 |  0.6092 |  0.6273 |     0.615s |  4.711s | COMPLETE |\n| 7     |    None |  0.0014 |     sag |      640 |      0.1 |  0.5909 |  0.6273 |     0.725s |  5.436s | COMPLETE |\n| 8     |      l2 |  0.0224 |     sag |      500 |      1.0 |  0.6226 |  0.6273 |     0.653s |  6.089s | COMPLETE |\n| 9     |      l1 |  0.1594 |    saga |      630 |      0.2 |  0.6236 |  0.6273 |     0.810s |  6.898s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 4\nBest parameters:\n --> penalty: l2\n --> C: 0.2122\n --> solver: newton-cg\n --> max_iter: 730\n --> l1_ratio: 0.9\nBest evaluation --> f1: 0.6273\nTime elapsed: 6.898s\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.6188\nTest evaluation --> f1: 0.6172\nTime elapsed: 0.352s\n-------------------------------------------------\nTime: 7.251s\n\n\nFinal results ==================== >>\nTotal time: 7.461s\n-------------------------------------\nLogisticRegression --> f1: 0.6172\n

                                                                                                                                                                                                                                                                                                                                                                  We already used the test set to choose the best model for futher tuning, so this set is no longer truly independent. Although it may not be directly visible in the results, using the test set now to evaluate the tuned LR model would be a mistake, since it carries a bias. For this reason, we have set apart an extra, indepedent set to validate the final model: the holdout set. If we are not going to use the test set for validation, we might as well use it to train the model and so optimize the use of the available data. Use the full_train method for this.

                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[10]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Re-train the model on the full dataset (train + test) \natom.lr_tuned.full_train()\n
                                                                                                                                                                                                                                                                                                                                                                  # Re-train the model on the full dataset (train + test) atom.lr_tuned.full_train()
                                                                                                                                                                                                                                                                                                                                                                  Fit ---------------------------------------------\nTrain evaluation --> f1: 0.6185\nTest evaluation --> f1: 0.6185\nTime elapsed: 0.717s\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[11]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Evaluate on the holdout set\natom.lr_tuned.evaluate(rows=\"holdout\")\n
                                                                                                                                                                                                                                                                                                                                                                  # Evaluate on the holdout set atom.lr_tuned.evaluate(rows=\"holdout\") Out[11]:
                                                                                                                                                                                                                                                                                                                                                                  accuracy     0.8577\nap           0.7473\nba           0.7480\nf1           0.6352\njaccard      0.4654\nmcc          0.5606\nprecision    0.7559\nrecall       0.5477\nauc          0.8873\nName: LR_tuned, dtype: float64
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[13]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.lr_tuned.plot_prc(rows=\"holdout\", legend=\"upper right\")\n
                                                                                                                                                                                                                                                                                                                                                                  atom.lr_tuned.plot_prc(rows=\"holdout\", legend=\"upper right\")"}, {"location": "examples/holdout_set/#example-holdout-set", "title": "Example: Holdout set\u00b6", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This example shows when and how to use ATOM's holdout set in an exploration pipeline.

                                                                                                                                                                                                                                                                                                                                                                  The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target RainTomorrow.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/holdout_set/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/holdout_set/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/holdout_set/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/hyperparameter_tuning/", "title": "Hyperparameter tuning", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Import packages\nfrom sklearn.datasets import load_breast_cancer\nfrom optuna.distributions import IntDistribution\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                                                                                                                                                                                                  # Import packages from sklearn.datasets import load_breast_cancer from optuna.distributions import IntDistribution from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Load the data\nX, y = load_breast_cancer(return_X_y=True)\n
                                                                                                                                                                                                                                                                                                                                                                  # Load the data X, y = load_breast_cancer(return_X_y=True) In\u00a0[3]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Initialize atom\natom = ATOMClassifier(X, y, n_jobs=4, verbose=2, random_state=1)\n
                                                                                                                                                                                                                                                                                                                                                                  # Initialize atom atom = ATOMClassifier(X, y, n_jobs=4, verbose=2, random_state=1)
                                                                                                                                                                                                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\nParallel processing with 4 cores.\nParallelization backend: loky\n\nDataset stats ==================== >>\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 141.24 kB\nScaled: False\nOutlier values: 167 (1.2%)\n\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Train a MultiLayerPerceptron model on two metrics\n# using a custom number of hidden layers\natom.run(\n    models=\"MLP\",\n    metric=[\"f1\", \"ap\"],\n    n_trials=10,\n    est_params={\"activation\": \"relu\"},\n    ht_params={\n        \"distributions\": {\n            \"hidden_layer_1\": IntDistribution(2, 4),\n            \"hidden_layer_2\": IntDistribution(10, 20),\n            \"hidden_layer_3\": IntDistribution(10, 20),\n            \"hidden_layer_4\": IntDistribution(2, 4),\n        }\n    }\n)\n
                                                                                                                                                                                                                                                                                                                                                                  # Train a MultiLayerPerceptron model on two metrics # using a custom number of hidden layers atom.run( models=\"MLP\", metric=[\"f1\", \"ap\"], n_trials=10, est_params={\"activation\": \"relu\"}, ht_params={ \"distributions\": { \"hidden_layer_1\": IntDistribution(2, 4), \"hidden_layer_2\": IntDistribution(10, 20), \"hidden_layer_3\": IntDistribution(10, 20), \"hidden_layer_4\": IntDistribution(2, 4), } } )
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: MLP\nMetric: f1, ap\n\n\nRunning hyperparameter tuning for MultiLayerPerceptron...\n| trial | hidden_layer_1 | hidden_layer_2 | hidden_layer_3 | hidden_layer_4 |      f1 | best_f1 |      ap | best_ap | time_trial | time_ht |    state |\n| ----- | -------------- | -------------- | -------------- | -------------- | ------- | ------- | ------- | ------- | ---------- | ------- | -------- |\n| 0     |              3 |             17 |             10 |              2 |  0.9464 |  0.9464 |  0.9844 |  0.9844 |     9.139s |  9.139s | COMPLETE |\n| 1     |              2 |             11 |             12 |              3 |  0.9744 |  0.9744 |  0.9991 |  0.9991 |    11.466s | 20.605s | COMPLETE |\n| 2     |              3 |             15 |             14 |              4 |  0.9915 |  0.9915 |  0.9978 |  0.9991 |     8.570s | 29.175s | COMPLETE |\n| 3     |              2 |             19 |             10 |              4 |  0.9655 |  0.9915 |  0.9878 |  0.9991 |     9.208s | 38.383s | COMPLETE |\n| 4     |              3 |             16 |             11 |              2 |  0.9661 |  0.9915 |  0.9981 |  0.9991 |     0.657s | 39.039s | COMPLETE |\n| 5     |              4 |             20 |             13 |              4 |  0.9739 |  0.9915 |  0.9989 |  0.9991 |     0.623s | 39.662s | COMPLETE |\n| 6     |              4 |             19 |             10 |              2 |  0.9828 |  0.9915 |  0.9907 |  0.9991 |     0.601s | 40.263s | COMPLETE |\n| 7     |              2 |             19 |             11 |              3 |  0.7733 |  0.9915 |  0.9997 |  0.9997 |     0.601s | 
40.863s | COMPLETE |\n| 8     |              4 |             15 |             17 |              2 |  0.9915 |  0.9915 |  0.9997 |  0.9997 |     0.601s | 41.464s | COMPLETE |\n| 9     |              4 |             19 |             10 |              4 |  0.9828 |  0.9915 |  0.9822 |  0.9997 |     0.599s | 42.062s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 8\nBest parameters:\n --> hidden_layer_sizes: (4, 15, 17, 2)\nBest evaluation --> f1: 0.9915   ap: 0.9997\nTime elapsed: 42.062s\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.9965   ap: 0.9991\nTest evaluation --> f1: 0.9718   ap: 0.9938\nTime elapsed: 1.515s\n-------------------------------------------------\nTime: 43.578s\n\n\nFinal results ==================== >>\nTotal time: 43.815s\n-------------------------------------\nMultiLayerPerceptron --> f1: 0.9718   ap: 0.9938\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # For multi-metric runs, the selected best trial is the first in the Pareto front\natom.mlp.best_trial\n
                                                                                                                                                                                                                                                                                                                                                                  # For multi-metric runs, the selected best trial is the first in the Pareto front atom.mlp.best_trial Out[5]:
                                                                                                                                                                                                                                                                                                                                                                  FrozenTrial(number=8, state=1, values=[0.9914529914529915, 0.9997077732320282], datetime_start=datetime.datetime(2023, 11, 4, 19, 13, 50, 113304), datetime_complete=datetime.datetime(2023, 11, 4, 19, 13, 50, 713850), params={'hidden_layer_1': 4, 'hidden_layer_2': 15, 'hidden_layer_3': 17, 'hidden_layer_4': 2}, user_attrs={'estimator': MLPClassifier(hidden_layer_sizes=(4, 15, 17, 2), random_state=1)}, system_attrs={'nsga2:generation': 0}, intermediate_values={}, distributions={'hidden_layer_1': IntDistribution(high=4, log=False, low=2, step=1), 'hidden_layer_2': IntDistribution(high=20, log=False, low=10, step=1), 'hidden_layer_3': IntDistribution(high=20, log=False, low=10, step=1), 'hidden_layer_4': IntDistribution(high=4, log=False, low=2, step=1)}, trial_id=8, value=None)
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_pareto_front()\n
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_pareto_front() In\u00a0[7]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # If you are unhappy with the results, it's possible to conitnue the study\natom.mlp.hyperparameter_tuning(n_trials=5)\n
                                                                                                                                                                                                                                                                                                                                                                  # If you are unhappy with the results, it's possible to conitnue the study atom.mlp.hyperparameter_tuning(n_trials=5)
                                                                                                                                                                                                                                                                                                                                                                  Running hyperparameter tuning for MultiLayerPerceptron...\n| trial | hidden_layer_1 | hidden_layer_2 | hidden_layer_3 | hidden_layer_4 |      f1 | best_f1 |      ap | best_ap | time_trial | time_ht |    state |\n| ----- | -------------- | -------------- | -------------- | -------------- | ------- | ------- | ------- | ------- | ---------- | ------- | -------- |\n| 10    |              4 |             18 |             13 |              4 |  0.9831 |  0.9915 |  0.9997 |  0.9997 |     0.673s | 42.735s | COMPLETE |\n| 11    |              2 |             14 |             19 |              2 |  0.9421 |  0.9915 |  0.9899 |  0.9997 |     0.604s | 43.339s | COMPLETE |\n| 12    |              2 |             11 |             10 |              4 |  0.7733 |  0.9915 |    0.99 |  0.9997 |     0.617s | 43.955s | COMPLETE |\n| 13    |              2 |             12 |             15 |              2 |  0.9558 |  0.9915 |  0.9985 |  0.9997 |     0.595s | 44.550s | COMPLETE |\n| 14    |              3 |             11 |             16 |              4 |  0.7733 |  0.9915 |  0.9721 |  0.9997 |     0.663s | 45.212s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 8\nBest parameters:\n --> hidden_layer_sizes: (4, 15, 17, 2)\nBest evaluation --> f1: 0.9915   ap: 0.9997\nTime elapsed: 45.212s\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[8]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # The trials attribute gives an overview of the trial results\natom.mlp.trials\n
                                                                                                                                                                                                                                                                                                                                                                  # The trials attribute gives an overview of the trial results atom.mlp.trials Out[8]: hidden_layer_1 hidden_layer_2 hidden_layer_3 hidden_layer_4 estimator f1 best_f1 ap best_ap time_trial time_ht state trial 0 3 17 10 2 MLPClassifier(hidden_layer_sizes=(3, 17, 10, 2... 0.946429 0.991453 0.984402 0.999708 9.138911 9.138911 COMPLETE 1 2 11 12 3 MLPClassifier(hidden_layer_sizes=(2, 11, 12, 3... 0.974359 0.991453 0.999128 0.999708 11.466475 20.605386 COMPLETE 2 3 15 14 4 MLPClassifier(hidden_layer_sizes=(3, 15, 14, 4... 0.991453 0.991453 0.997842 0.999708 8.569545 29.174931 COMPLETE 3 2 19 10 4 MLPClassifier(hidden_layer_sizes=(2, 19, 10, 4... 0.965517 0.991453 0.987805 0.999708 9.207920 38.382851 COMPLETE 4 3 16 11 2 MLPClassifier(hidden_layer_sizes=(3, 16, 11, 2... 0.966102 0.991453 0.998086 0.999708 0.656597 39.039448 COMPLETE 5 4 20 13 4 MLPClassifier(hidden_layer_sizes=(4, 20, 13, 4... 0.973913 0.991453 0.998855 0.999708 0.622566 39.662014 COMPLETE 6 4 19 10 2 MLPClassifier(hidden_layer_sizes=(4, 19, 10, 2... 0.982759 0.991453 0.990748 0.999708 0.600547 40.262561 COMPLETE 7 2 19 11 3 MLPClassifier(hidden_layer_sizes=(2, 19, 11, 3... 0.773333 0.991453 0.999708 0.999708 0.600546 40.863107 COMPLETE 8 4 15 17 2 MLPClassifier(hidden_layer_sizes=(4, 15, 17, 2... 0.991453 0.991453 0.999708 0.999708 0.600546 41.463653 COMPLETE 9 4 19 10 4 MLPClassifier(hidden_layer_sizes=(4, 19, 10, 4... 0.982759 0.991453 0.982168 0.999708 0.598815 42.062468 COMPLETE 10 4 18 13 4 MLPClassifier(hidden_layer_sizes=(4, 18, 13, 4... 
0.983051 0.991453 0.999708 0.999708 0.672611 42.735079 COMPLETE 11 2 14 19 2 MLPClassifier(hidden_layer_sizes=(2, 14, 19, 2... 0.942149 0.991453 0.989914 0.999708 0.603549 43.338628 COMPLETE 12 2 11 10 4 MLPClassifier(hidden_layer_sizes=(2, 11, 10, 4... 0.773333 0.991453 0.990024 0.999708 0.616561 43.955189 COMPLETE 13 2 12 15 2 MLPClassifier(hidden_layer_sizes=(2, 12, 15, 2... 0.955752 0.991453 0.998518 0.999708 0.594541 44.549730 COMPLETE 14 3 11 16 4 MLPClassifier(hidden_layer_sizes=(3, 11, 16, 4... 0.773333 0.991453 0.972070 0.999708 0.662602 45.212332 COMPLETE In\u00a0[9]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Select a custom best trial...\natom.mlp.best_trial = 2\n\n# ...and check that the best parameters are now those in the selected trial\natom.mlp.best_params\n
                                                                                                                                                                                                                                                                                                                                                                  # Select a custom best trial... atom.mlp.best_trial = 2 # ...and check that the best parameters are now those in the selected trial atom.mlp.best_params Out[9]:
                                                                                                                                                                                                                                                                                                                                                                  {'hidden_layer_sizes': (3, 15, 14, 4)}
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[10]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Lastly, fit the model on the complete training set \n# using the new combination of hyperparameters\natom.mlp.fit()\n
                                                                                                                                                                                                                                                                                                                                                                  # Lastly, fit the model on the complete training set # using the new combination of hyperparameters atom.mlp.fit()
                                                                                                                                                                                                                                                                                                                                                                  Fit ---------------------------------------------\nTrain evaluation --> f1: 0.9983   ap: 0.9998\nTest evaluation --> f1: 0.9718   ap: 0.9947\nTime elapsed: 3.048s\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[11]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_trials()\n
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_trials() In\u00a0[12]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_parallel_coordinate()\n
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_parallel_coordinate()"}, {"location": "examples/hyperparameter_tuning/#example-hyperparameter-tuning", "title": "Example: Hyperparameter tuning\u00b6", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This example shows an advanced example on how to optimize your model's hyperparameters for multi-metric runs.

                                                                                                                                                                                                                                                                                                                                                                  Import the breast cancer dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/hyperparameter_tuning/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/hyperparameter_tuning/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/hyperparameter_tuning/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/imbalanced_datasets/", "title": "Imbalanced datasets", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Import packages\nfrom atom import ATOMClassifier\nfrom sklearn.datasets import make_classification\n
                                                                                                                                                                                                                                                                                                                                                                  # Import packages from atom import ATOMClassifier from sklearn.datasets import make_classification In\u00a0[2]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Create a mock imbalanced dataset\nX, y = make_classification(\n    n_samples=5000,\n    n_features=30,\n    n_informative=20,\n    weights=(0.95,),\n    random_state=1,\n)\n
                                                                                                                                                                                                                                                                                                                                                                  # Create a mock imbalanced dataset X, y = make_classification( n_samples=5000, n_features=30, n_informative=20, weights=(0.95,), random_state=1, ) In\u00a0[3]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Initialize atom\natom = ATOMClassifier(X, y, test_size=0.2, verbose=2, random_state=1)\n
                                                                                                                                                                                                                                                                                                                                                                  # Initialize atom atom = ATOMClassifier(X, y, test_size=0.2, verbose=2, random_state=1)
                                                                                                                                                                                                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (5000, 31)\nTrain set size: 4000\nTest set size: 1000\n-------------------------------------\nMemory: 1.24 MB\nScaled: False\nOutlier values: 570 (0.5%)\n\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Let's have a look at the data. Note that, since the input wasn't\n# a dataframe, atom has given default names to the columns.\natom.head()\n
                                                                                                                                                                                                                                                                                                                                                                  # Let's have a look at the data. Note that, since the input wasn't # a dataframe, atom has given default names to the columns. atom.head() Out[4]: x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 ... x21 x22 x23 x24 x25 x26 x27 x28 x29 target 0 -0.535760 -2.426045 1.256836 0.374501 -3.241958 -1.239468 -0.208750 -6.015995 3.698669 0.112512 ... 0.044302 -1.935727 10.870353 0.286755 -2.416507 0.556990 -1.522635 3.719201 1.449135 0 1 -3.311935 -3.149920 -0.801252 -2.644414 -0.704889 -3.312256 0.714515 2.992345 5.056910 3.036775 ... 2.224359 0.451273 -1.822108 -1.435801 0.036132 -1.364583 1.215663 5.232161 1.408798 0 2 3.821199 1.328129 -1.000720 -13.151697 0.254253 1.263636 -1.088451 4.924264 -1.225646 -6.974824 ... 3.541222 1.686667 -13.763703 -1.321256 1.677687 0.774966 -5.067689 4.663386 -1.714186 0 3 5.931126 3.338830 0.545906 2.296355 -3.941088 3.527252 -0.158770 3.138381 -0.927460 -1.642079 ... -3.634442 7.853176 -8.457598 0.000490 -2.612756 -1.138206 0.497150 4.351289 -0.321748 0 4 -2.829472 -1.227185 -0.751892 3.056106 -1.988920 -2.219184 -0.075882 5.790102 -2.786671 2.023458 ... 4.057954 1.178564 -15.028187 1.627140 -1.093587 -0.422655 1.777011 6.660638 -2.553723 0

                                                                                                                                                                                                                                                                                                                                                                  5 rows \u00d7 31 columns

                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Let's start reducing the number of features\natom.feature_selection(\"rfe\", solver=\"rf\", n_features=12)\n
                                                                                                                                                                                                                                                                                                                                                                  # Let's start reducing the number of features atom.feature_selection(\"rfe\", solver=\"rf\", n_features=12)
                                                                                                                                                                                                                                                                                                                                                                  Fitting FeatureSelector...\nPerforming feature selection ...\n --> rfe selected 12 features from the dataset.\n   --> Dropping feature x1 (rank 8).\n   --> Dropping feature x2 (rank 11).\n   --> Dropping feature x4 (rank 3).\n   --> Dropping feature x6 (rank 16).\n   --> Dropping feature x7 (rank 14).\n   --> Dropping feature x10 (rank 19).\n   --> Dropping feature x12 (rank 13).\n   --> Dropping feature x13 (rank 12).\n   --> Dropping feature x14 (rank 9).\n   --> Dropping feature x16 (rank 10).\n   --> Dropping feature x18 (rank 17).\n   --> Dropping feature x19 (rank 2).\n   --> Dropping feature x20 (rank 4).\n   --> Dropping feature x22 (rank 7).\n   --> Dropping feature x23 (rank 5).\n   --> Dropping feature x24 (rank 18).\n   --> Dropping feature x25 (rank 6).\n   --> Dropping feature x26 (rank 15).\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[7]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Fit a model directly on the imbalanced data\natom.run(\"RF\", metric=\"ba\")\n
                                                                                                                                                                                                                                                                                                                                                                  # Fit a model directly on the imbalanced data atom.run(\"RF\", metric=\"ba\")
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: RF\nMetric: ba\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> ba: 1.0\nTest evaluation --> ba: 0.5556\nTime elapsed: 2.497s\n-------------------------------------------------\nTime: 2.497s\n\n\nFinal results ==================== >>\nTotal time: 2.568s\n-------------------------------------\nRandomForest --> ba: 0.5556 ~\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[8]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # The transformer and the models have been added to the branch\natom.branch\n
                                                                                                                                                                                                                                                                                                                                                                  # The transformer and the models have been added to the branch atom.branch Out[8]:
                                                                                                                                                                                                                                                                                                                                                                  Branch(main)
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[9]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Create a new branch for oversampling\natom.branch = \"oversample\"\n
                                                                                                                                                                                                                                                                                                                                                                  # Create a new branch for oversampling atom.branch = \"oversample\"
                                                                                                                                                                                                                                                                                                                                                                  Successfully created new branch: oversample.\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[10]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Perform oversampling of the minority class\natom.balance(strategy=\"smote\")\n
                                                                                                                                                                                                                                                                                                                                                                  # Perform oversampling of the minority class atom.balance(strategy=\"smote\")
                                                                                                                                                                                                                                                                                                                                                                  Oversampling with SMOTE...\n --> Adding 3570 samples to class 1.\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[11]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.classes  # Check the balanced training set!\n
                                                                                                                                                                                                                                                                                                                                                                  atom.classes # Check the balanced training set! Out[11]: dataset train test 0 4731 3785 946 1 3839 3785 54 In\u00a0[12]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Train another model on the new branch. Add a tag after \n# the model's acronym to distinguish it from the first model\natom.run(\"rf_os\")  # os for oversample\n
                                                                                                                                                                                                                                                                                                                                                                  # Train another model on the new branch. Add a tag after # the model's acronym to distinguish it from the first model atom.run(\"rf_os\") # os for oversample
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: RF_os\nMetric: ba\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> ba: 1.0\nTest evaluation --> ba: 0.7672\nTime elapsed: 4.136s\n-------------------------------------------------\nTime: 4.136s\n\n\nFinal results ==================== >>\nTotal time: 4.248s\n-------------------------------------\nRandomForest --> ba: 0.7672 ~\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[14]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Create the undersampling branch\n# Split from master to not adopt the oversmapling transformer\natom.branch = \"undersample_from_main\"\n
                                                                                                                                                                                                                                                                                                                                                                  # Create the undersampling branch # Split from master to not adopt the oversmapling transformer atom.branch = \"undersample_from_main\"
                                                                                                                                                                                                                                                                                                                                                                  Successfully created new branch: undersample.\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[15]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.classes  # In this branch, the data is still imbalanced\n
                                                                                                                                                                                                                                                                                                                                                                  atom.classes # In this branch, the data is still imbalanced Out[15]: dataset train test 0 4731 3785 946 1 269 215 54 In\u00a0[16]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Perform undersampling of the majority class\natom.balance(strategy=\"NearMiss\")\n
                                                                                                                                                                                                                                                                                                                                                                  # Perform undersampling of the majority class atom.balance(strategy=\"NearMiss\")
                                                                                                                                                                                                                                                                                                                                                                  Undersampling with NearMiss...\n --> Removing 3570 samples from class 0.\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[17]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.run(\"rf_us\")\n
                                                                                                                                                                                                                                                                                                                                                                  atom.run(\"rf_us\")
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: RF_us\nMetric: ba\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> ba: 1.0\nTest evaluation --> ba: 0.6706\nTime elapsed: 0.285s\n-------------------------------------------------\nTime: 0.285s\n\n\nFinal results ==================== >>\nTotal time: 0.321s\n-------------------------------------\nRandomForest --> ba: 0.6706 ~\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[18]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Check that the branch only contains the desired transformers \natom.branch\n
                                                                                                                                                                                                                                                                                                                                                                  # Check that the branch only contains the desired transformers atom.branch Out[18]:
                                                                                                                                                                                                                                                                                                                                                                  Branch(undersample)
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[19]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Visualize the complete pipeline\natom.plot_pipeline()\n
                                                                                                                                                                                                                                                                                                                                                                  # Visualize the complete pipeline atom.plot_pipeline() In\u00a0[20]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.evaluate()\n
                                                                                                                                                                                                                                                                                                                                                                  atom.evaluate() Out[20]: accuracy ap ba f1 jaccard mcc precision recall auc RF 0.952 0.6562 0.5556 0.2000 0.1111 0.3252 1.000 0.1111 0.9107 RF_os 0.956 0.6215 0.7672 0.5769 0.4054 0.5542 0.600 0.5556 0.9251 RF_us 0.509 0.3687 0.6706 0.1578 0.0857 0.1545 0.087 0.8519 0.8258 In\u00a0[21]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_prc()\n
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_prc() In\u00a0[22]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_roc()\n
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_roc()"}, {"location": "examples/imbalanced_datasets/#example-imbalanced-datasets", "title": "Example: Imbalanced datasets\u00b6", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This example shows how ATOM can help you handle imbalanced datasets. We will evaluate the performance of three different Random Forest models: one trained directly on the imbalanced dataset, one trained on an oversampled dataset and the last one trained on an undersampled dataset.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/imbalanced_datasets/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/imbalanced_datasets/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/imbalanced_datasets/#oversampling", "title": "Oversampling\u00b6", "text": ""}, {"location": "examples/imbalanced_datasets/#undersampling", "title": "Undersampling\u00b6", "text": ""}, {"location": "examples/imbalanced_datasets/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/in_training_validation/", "title": "In-training validation", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Import packages\nfrom sklearn.datasets import load_breast_cancer\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                                                                                                                                                                                                  # Import packages from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Load the data\nX, y = load_breast_cancer(return_X_y=True)\n
                                                                                                                                                                                                                                                                                                                                                                  # Load the data X, y = load_breast_cancer(return_X_y=True) In\u00a0[3]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Initialize atom\natom = ATOMClassifier(X, y, verbose=2, random_state=1)\n
                                                                                                                                                                                                                                                                                                                                                                  # Initialize atom atom = ATOMClassifier(X, y, verbose=2, random_state=1)
                                                                                                                                                                                                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 141.24 kB\nScaled: False\nOutlier values: 167 (1.2%)\n\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Not all models support in-training validation\n# You can chek which ones do using the available_models method\ndf = atom.available_models()[[\"acronym\", \"model\", \"has_validation\"]]\ndf[df[\"has_validation\"]]\n
                                                                                                                                                                                                                                                                                                                                                                  # Not all models support in-training validation # You can chek which ones do using the available_models method df = atom.available_models()[[\"acronym\", \"model\", \"has_validation\"]] df[df[\"has_validation\"]] Out[4]: acronym model has_validation 3 CatB CatBoost True 15 LGB LightGBM True 19 MLP MultiLayerPerceptron True 21 PA PassiveAggressive True 22 Perc Perceptron True 27 SGD StochasticGradientDescent True 29 XGB XGBoost True In\u00a0[5]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Run the models normally\natom.run(models=[\"MLP\", \"LGB\"], metric=\"auc\")\n
                                                                                                                                                                                                                                                                                                                                                                  # Run the models normally atom.run(models=[\"MLP\", \"LGB\"], metric=\"auc\")
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: MLP, LGB\nMetric: auc\n\n\nResults for MultiLayerPerceptron:\nFit ---------------------------------------------\nTrain evaluation --> auc: 0.9997\nTest evaluation --> auc: 0.9936\nTime elapsed: 1.821s\n-------------------------------------------------\nTime: 1.821s\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --> auc: 1.0\nTest evaluation --> auc: 0.9775\nTime elapsed: 0.352s\n-------------------------------------------------\nTime: 0.352s\n\n\nFinal results ==================== >>\nTotal time: 2.236s\n-------------------------------------\nMultiLayerPerceptron --> auc: 0.9936 !\nLightGBM             --> auc: 0.9775\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_evals(title=\"In-training validation scores\")\n
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_evals(title=\"In-training validation scores\") In\u00a0[7]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Plot the validation on the train and test set\natom.lgb.plot_evals(dataset=\"train+test\", title=\"LightGBM's in-training validation\")\n
                                                                                                                                                                                                                                                                                                                                                                  # Plot the validation on the train and test set atom.lgb.plot_evals(dataset=\"train+test\", title=\"LightGBM's in-training validation\")"}, {"location": "examples/in_training_validation/#example-in-training-validation", "title": "Example: In-training validation\u00b6", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This example shows how to keep track of the model's performance during training.

                                                                                                                                                                                                                                                                                                                                                                  Import the breast cancer dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/in_training_validation/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/in_training_validation/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/in_training_validation/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/memory_considerations/", "title": "Memory considerations", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Import packages\nimport os\nimport tempfile\nimport pandas as pd\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                                                                                                                                                                                                  # Import packages import os import tempfile import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n
                                                                                                                                                                                                                                                                                                                                                                  # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\") # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0

                                                                                                                                                                                                                                                                                                                                                                  5 rows \u00d7 22 columns

                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[3]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Define a temp directory to store the files in this example\ntempdir = tempfile.gettempdir()\n
                                                                                                                                                                                                                                                                                                                                                                  # Define a temp directory to store the files in this example tempdir = tempfile.gettempdir() In\u00a0[4]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  def get_size(filepath):\n    \"\"\"Return the size of the object in MB.\"\"\"\n    return f\"{os.path.getsize(filepath + '.pkl') / 1e6:.2f}MB\"\n
                                                                                                                                                                                                                                                                                                                                                                  def get_size(filepath): \"\"\"Return the size of the object in MB.\"\"\" return f\"{os.path.getsize(filepath + '.pkl') / 1e6:.2f}MB\" In\u00a0[5]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMClassifier(X, y=\"RainTomorrow\", verbose=2)\n
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMClassifier(X, y=\"RainTomorrow\", verbose=2)
                                                                                                                                                                                                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 25.03 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n\n

                                                                                                                                                                                                                                                                                                                                                                  Note that the datset takes ~25MB. We can reduce the size of the dataset using the shrink method, which reduces the dtypes to their smallest possible value.

                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.dtypes\n
                                                                                                                                                                                                                                                                                                                                                                  atom.dtypes Out[6]:
                                                                                                                                                                                                                                                                                                                                                                  Location          object\nMinTemp          float64\nMaxTemp          float64\nRainfall         float64\nEvaporation      float64\nSunshine         float64\nWindGustDir       object\nWindGustSpeed    float64\nWindDir9am        object\nWindDir3pm        object\nWindSpeed9am     float64\nWindSpeed3pm     float64\nHumidity9am      float64\nHumidity3pm      float64\nPressure9am      float64\nPressure3pm      float64\nCloud9am         float64\nCloud3pm         float64\nTemp9am          float64\nTemp3pm          float64\nRainToday         object\nRainTomorrow       int64\ndtype: object
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[7]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.shrink(str2cat=True)\n
                                                                                                                                                                                                                                                                                                                                                                  atom.shrink(str2cat=True)
                                                                                                                                                                                                                                                                                                                                                                  The column dtypes are successfully converted.\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[8]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.dtypes\n
                                                                                                                                                                                                                                                                                                                                                                  atom.dtypes Out[8]:
                                                                                                                                                                                                                                                                                                                                                                  Location         category\nMinTemp           Float32\nMaxTemp           Float32\nRainfall          Float32\nEvaporation       Float32\nSunshine          Float32\nWindGustDir      category\nWindGustSpeed       Int16\nWindDir9am       category\nWindDir3pm       category\nWindSpeed9am        Int16\nWindSpeed3pm         Int8\nHumidity9am          Int8\nHumidity3pm          Int8\nPressure9am       Float32\nPressure3pm       Float32\nCloud9am             Int8\nCloud3pm             Int8\nTemp9am           Float32\nTemp3pm           Float32\nRainToday        category\nRainTomorrow         Int8\ndtype: object
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[9]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Let's check the memory usage again...\n# Notice the huge drop!\natom.stats()\n
                                                                                                                                                                                                                                                                                                                                                                  # Let's check the memory usage again... # Notice the huge drop! atom.stats()
                                                                                                                                                                                                                                                                                                                                                                  Dataset stats ==================== >>\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 9.67 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[10]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Now, we create some new branches to train models with different trasnformers\natom.impute()\natom.encode()\natom.run(\"LDA\")\n\natom.branch = \"b2\"\natom.scale()\natom.run(\"LDA_scaled\")\n\natom.branch = \"b3_from_main\"\natom.normalize()\natom.run(\"LDA_norm\")\n
                                                                                                                                                                                                                                                                                                                                                                  # Now, we create some new branches to train models with different trasnformers atom.impute() atom.encode() atom.run(\"LDA\") atom.branch = \"b2\" atom.scale() atom.run(\"LDA_scaled\") atom.branch = \"b3_from_main\" atom.normalize() atom.run(\"LDA_norm\")
                                                                                                                                                                                                                                                                                                                                                                  Fitting Imputer...\nImputing missing values...\n --> Dropping 637 samples due to missing values in feature MinTemp.\n --> Dropping 322 samples due to missing values in feature MaxTemp.\n --> Dropping 1406 samples due to missing values in feature Rainfall.\n --> Dropping 60843 samples due to missing values in feature Evaporation.\n --> Dropping 67816 samples due to missing values in feature Sunshine.\n --> Dropping 9330 samples due to missing values in feature WindGustDir.\n --> Dropping 9270 samples due to missing values in feature WindGustSpeed.\n --> Dropping 10013 samples due to missing values in feature WindDir9am.\n --> Dropping 3778 samples due to missing values in feature WindDir3pm.\n --> Dropping 1348 samples due to missing values in feature WindSpeed9am.\n --> Dropping 2630 samples due to missing values in feature WindSpeed3pm.\n --> Dropping 1774 samples due to missing values in feature Humidity9am.\n --> Dropping 3610 samples due to missing values in feature Humidity3pm.\n --> Dropping 14014 samples due to missing values in feature Pressure9am.\n --> Dropping 13981 samples due to missing values in feature Pressure3pm.\n --> Dropping 53657 samples due to missing values in feature Cloud9am.\n --> Dropping 57094 samples due to missing values in feature Cloud3pm.\n --> Dropping 904 samples due to missing values in feature Temp9am.\n --> Dropping 2726 samples due to missing values in feature Temp3pm.\n --> Dropping 1406 samples due to missing values in feature RainToday.\nFitting Encoder...\nEncoding categorical columns...\n --> Target-encoding feature Location. 
Contains 26 classes.\n --> Target-encoding feature WindGustDir. Contains 16 classes.\n --> Target-encoding feature WindDir9am. Contains 16 classes.\n --> Target-encoding feature WindDir3pm. Contains 16 classes.\n --> Ordinal-encoding feature RainToday. Contains 2 classes.\n\nTraining ========================= >>\nModels: LDA\nMetric: f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.6213\nTest evaluation --> f1: 0.6341\nTime elapsed: 0.375s\n-------------------------------------------------\nTime: 0.375s\n\n\nFinal results ==================== >>\nTotal time: 0.613s\n-------------------------------------\nLinearDiscriminantAnalysis --> f1: 0.6341\nSuccessfully created new branch: b2.\nFitting Scaler...\nScaling features...\n\nTraining ========================= >>\nModels: LDA_scaled\nMetric: f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.6213\nTest evaluation --> f1: 0.6341\nTime elapsed: 0.390s\n-------------------------------------------------\nTime: 0.390s\n\n\nFinal results ==================== >>\nTotal time: 0.626s\n-------------------------------------\nLinearDiscriminantAnalysis --> f1: 0.6341\nSuccessfully created new branch: b3.\nFitting Normalizer...\nNormalizing features...\n\nTraining ========================= >>\nModels: LDA_norm\nMetric: f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.6267\nTest evaluation --> f1: 0.6368\nTime elapsed: 0.369s\n-------------------------------------------------\nTime: 0.369s\n\n\nFinal results ==================== >>\nTotal time: 0.626s\n-------------------------------------\nLinearDiscriminantAnalysis --> f1: 0.6368\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[11]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # If we save atom now, notice the size\n# This is because atom keeps a copy of every branch in memory\nfilename = tempdir + \"atom1\"\natom.save(filename)\nget_size(filename)\n
                                                                                                                                                                                                                                                                                                                                                                  # If we save atom now, notice the size # This is because atom keeps a copy of every branch in memory filename = tempdir + \"atom1\" atom.save(filename) get_size(filename)
                                                                                                                                                                                                                                                                                                                                                                  ATOMClassifier successfully saved.\n
                                                                                                                                                                                                                                                                                                                                                                  Out[11]:
                                                                                                                                                                                                                                                                                                                                                                  '34.92MB'

                                                                                                                                                                                                                                                                                                                                                                  To avoid large memory usages, set the memory parameter.

                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[12]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMClassifier(X, y=\"RainTomorrow\", memory=tempdir, verbose=1, random_state=1)\natom.shrink(str2cat=True)\natom.impute()\natom.encode()\natom.run(\"LDA\")\n\natom.branch = \"b2\"\natom.scale()\natom.run(\"LDA_scaled\")\n\natom.branch = \"b3_from_main\"\natom.normalize()\natom.run(\"LDA_norm\")\n
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMClassifier(X, y=\"RainTomorrow\", memory=tempdir, verbose=1, random_state=1) atom.shrink(str2cat=True) atom.impute() atom.encode() atom.run(\"LDA\") atom.branch = \"b2\" atom.scale() atom.run(\"LDA_scaled\") atom.branch = \"b3_from_main\" atom.normalize() atom.run(\"LDA_norm\")
                                                                                                                                                                                                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\nCache storage: C:\\Users\\Mavs\\AppData\\Local\\Temp\\joblib\n\nDataset stats ==================== >>\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 25.03 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n\nThe column dtypes are successfully converted.\nFitting Imputer...\nImputing missing values...\nFitting Encoder...\nEncoding categorical columns...\n\nTraining ========================= >>\nModels: LDA\nMetric: f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.6233\nTest evaluation --> f1: 0.6248\nTime elapsed: 0.445s\n-------------------------------------------------\nTime: 0.445s\n\n\nFinal results ==================== >>\nTotal time: 0.708s\n-------------------------------------\nLinearDiscriminantAnalysis --> f1: 0.6248\nSuccessfully created new branch: b2.\nFitting Scaler...\nScaling features...\n\nTraining ========================= >>\nModels: LDA_scaled\nMetric: f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.6233\nTest evaluation --> f1: 0.6248\nTime elapsed: 0.454s\n-------------------------------------------------\nTime: 0.454s\n\n\nFinal results ==================== >>\nTotal time: 0.737s\n-------------------------------------\nLinearDiscriminantAnalysis --> f1: 0.6248\nSuccessfully created new 
branch: b3.\nFitting Normalizer...\nNormalizing features...\n\nTraining ========================= >>\nModels: LDA_norm\nMetric: f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.6279\nTest evaluation --> f1: 0.6298\nTime elapsed: 0.447s\n-------------------------------------------------\nTime: 0.447s\n\n\nFinal results ==================== >>\nTotal time: 0.740s\n-------------------------------------\nLinearDiscriminantAnalysis --> f1: 0.6298\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[13]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # And now, it only takes a fraction of the previous size\n# This is because the data of inactive branches is now stored locally\nfilename = tempdir + \"atom2\"\natom.save(filename)\nget_size(filename)\n
                                                                                                                                                                                                                                                                                                                                                                  # And now, it only takes a fraction of the previous size # This is because the data of inactive branches is now stored locally filename = tempdir + \"atom2\" atom.save(filename) get_size(filename)
                                                                                                                                                                                                                                                                                                                                                                  ATOMClassifier successfully saved.\n
                                                                                                                                                                                                                                                                                                                                                                  Out[13]:
                                                                                                                                                                                                                                                                                                                                                                  '9.63MB'

                                                                                                                                                                                                                                                                                                                                                                  Additionnaly, repeated calls to the same transformers with the same data will use the cached results. Don't forget to specify the random_state parameter to ensure the data remains the exact same.

                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[14]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMClassifier(X, y=\"RainTomorrow\", memory=tempdir, verbose=1, random_state=1)\natom.shrink(str2cat=True)\n
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMClassifier(X, y=\"RainTomorrow\", memory=tempdir, verbose=1, random_state=1) atom.shrink(str2cat=True)
                                                                                                                                                                                                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\nCache storage: C:\\Users\\Mavs\\AppData\\Local\\Temp\\joblib\n\nDataset stats ==================== >>\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 25.03 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n\nThe column dtypes are successfully converted.\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[15]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Note the transformers are no longer fitted,\n# instead the results are immediately read from cache\natom.impute()\natom.encode()\n
                                                                                                                                                                                                                                                                                                                                                                  # Note the transformers are no longer fitted, # instead the results are immediately read from cache atom.impute() atom.encode()
                                                                                                                                                                                                                                                                                                                                                                  Retrieving cached results for Imputer...\nRetrieving cached results for Encoder...\nEncoding categorical columns...\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[16]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.dataset\n
                                                                                                                                                                                                                                                                                                                                                                  atom.dataset Out[16]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 0.075703 13.0 30.5 0.0 6.8 10.0 0.271668 59 0.312069 0.273733 ... 19 8 1013.599976 1008.0 0 2 19.6 29.9 0.0 0 1 0.245394 15.3 22.4 16.0 4.2 3.3 0.204934 39 0.236475 0.199626 ... 83 63 1025.5 1023.599976 6 6 16.9 21.1 1.0 1 2 0.262397 27.9 34.5 0.0 9.0 7.9 0.1737 72 0.236475 0.306935 ... 72 63 1009.0 1005.5 7 7 31.0 33.099998 0.0 1 3 0.239174 12.9 27.9 0.0 5.4 8.6 0.269421 39 0.256213 0.286159 ... 69 56 1023.400024 1019.799988 7 7 14.7 23.4 0.0 0 4 0.253089 7.4 14.3 0.8 2.8 4.0 0.210095 31 0.269333 0.167808 ... 84 62 1023.599976 1023.200012 4 7 9.0 13.6 0.0 1 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 56415 0.295559 23.9 28.1 0.0 2.6 7.7 0.241448 44 0.279553 0.259391 ... 86 79 1015.900024 1013.900024 7 7 25.799999 27.5 0.0 0 56416 0.217037 13.6 24.6 0.0 4.4 7.8 0.1737 39 0.193908 0.197102 ... 87 61 1023.200012 1022.599976 7 3 17.299999 21.4 0.0 0 56417 0.112176 16.299999 38.700001 0.0 10.2 13.4 0.1737 24 0.149795 0.168702 ... 29 8 1013.5 1010.299988 5 2 26.4 36.900002 0.0 0 56418 0.295559 11.5 19.200001 0.8 2.0 7.0 0.147458 22 0.13795 0.195807 ... 73 52 1021.299988 1018.799988 3 4 17.1 18.4 0.0 0 56419 0.403054 5.9 18.0 0.4 0.8 6.7 0.269421 26 0.312069 0.286159 ... 92 65 1028.0 1025.300049 3 2 9.4 16.6 0.0 0

                                                                                                                                                                                                                                                                                                                                                                  56420 rows \u00d7 22 columns

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/memory_considerations/#example-memory-considerations", "title": "Example: Memory considerations\u00b6", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This example shows how to use the memory parameter to make efficient use of the available memory.

                                                                                                                                                                                                                                                                                                                                                                  The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target RainTomorrow.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/memory_considerations/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/memory_considerations/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/multi_metric/", "title": "Multi-metric runs", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Import packages\nimport pandas as pd\nfrom atom import ATOMRegressor\n
                                                                                                                                                                                                                                                                                                                                                                  # Import packages import pandas as pd from atom import ATOMRegressor In\u00a0[2]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Load data\nX = pd.read_csv(\"docs_source/examples/datasets/abalone.csv\")\n\n# Let's have a look\nX.head()\n
                                                                                                                                                                                                                                                                                                                                                                  # Load data X = pd.read_csv(\"docs_source/examples/datasets/abalone.csv\") # Let's have a look X.head() Out[2]: Sex Length Diameter Height Whole weight Shucked weight Viscera weight Shell weight Rings 0 M 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 15 1 M 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 7 2 F 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 9 3 M 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 10 4 I 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 7 In\u00a0[3]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMRegressor(X, n_jobs=1, verbose=2, random_state=1)\n
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMRegressor(X, n_jobs=1, verbose=2, random_state=1)
                                                                                                                                                                                                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Regression.\n\nDataset stats ==================== >>\nShape: (4177, 9)\nTrain set size: 3342\nTest set size: 835\n-------------------------------------\nMemory: 300.88 kB\nScaled: False\nCategorical features: 1 (12.5%)\nOutlier values: 189 (0.6%)\n\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.encode()\n
                                                                                                                                                                                                                                                                                                                                                                  atom.encode()
                                                                                                                                                                                                                                                                                                                                                                  Fitting Encoder...\nEncoding categorical columns...\n --> OneHot-encoding feature Sex. Contains 3 classes.\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # For every step of the BO, both metrics are calculated,\n# but only the first is used for optimization!\natom.run(\n    models=[\"lsvm\", \"hGBM\"],\n    metric=(\"r2\", \"rmse\"),\n    n_trials=10,\n    n_bootstrap=6,\n)\n
                                                                                                                                                                                                                                                                                                                                                                  # For every step of the BO, both metrics are calculated, # but only the first is used for optimization! atom.run( models=[\"lsvm\", \"hGBM\"], metric=(\"r2\", \"rmse\"), n_trials=10, n_bootstrap=6, )
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: lSVM, hGBM\nMetric: r2, rmse\n\n\nRunning hyperparameter tuning for LinearSVM...\n| trial |                    loss |       C |    dual |      r2 | best_r2 |    rmse | best_rmse | time_trial | time_ht |    state |\n| ----- | ----------------------- | ------- | ------- | ------- | ------- | ------- | --------- | ---------- | ------- | -------- |\n| 0     | squared_epsilon_insen.. |   0.001 |    True |  0.2887 |  0.2887 | -2.6528 |   -2.6528 |     0.043s |  0.043s | COMPLETE |\n| 1     | squared_epsilon_insen.. |  0.0534 |   False |  0.3862 |  0.3862 | -2.5926 |   -2.5926 |     0.043s |  0.086s | COMPLETE |\n| 2     | squared_epsilon_insen.. |  0.0105 |    True |   0.433 |   0.433 | -2.4084 |   -2.4084 |     0.054s |  0.140s | COMPLETE |\n| 3     |     epsilon_insensitive |  0.6215 |    True |  0.4022 |   0.433 | -2.5251 |   -2.4084 |     0.045s |  0.185s | COMPLETE |\n| 4     | squared_epsilon_insen.. |  0.0369 |   False |  0.4057 |   0.433 | -2.5477 |   -2.4084 |     0.040s |  0.225s | COMPLETE |\n| 5     |     epsilon_insensitive |  0.0016 |    True | -1.5344 |   0.433 | -5.0102 |   -2.4084 |     0.035s |  0.260s | COMPLETE |\n| 6     | squared_epsilon_insen.. | 61.5811 |   False |  0.4354 |  0.4354 | -2.3845 |   -2.3845 |     0.034s |  0.294s | COMPLETE |\n| 7     | squared_epsilon_insen.. |  14.898 |   False |  0.4925 |  0.4925 | -2.2628 |   -2.2628 |     0.035s |  0.329s | COMPLETE |\n| 8     |     epsilon_insensitive |  0.0252 |    True |  0.3695 |  0.4925 | -2.6178 |   -2.2628 |     0.035s |  0.364s | COMPLETE |\n| 9     | squared_epsilon_insen.. 
|  0.0294 |    True |  0.4767 |  0.4925 | -2.3896 |   -2.2628 |     0.044s |  0.408s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 7\nBest parameters:\n --> loss: squared_epsilon_insensitive\n --> C: 14.898\n --> dual: False\nBest evaluation --> r2: 0.4925   rmse: -2.2628\nTime elapsed: 0.408s\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.4592   rmse: -2.3795\nTest evaluation --> r2: 0.4584   rmse: -2.3369\nTime elapsed: 0.089s\nBootstrap ---------------------------------------\nEvaluation --> r2: 0.4577 \u00b1 0.002   rmse: -2.3384 \u00b1 0.0043\nTime elapsed: 0.094s\n-------------------------------------------------\nTime: 0.592s\n\n\nRunning hyperparameter tuning for HistGradientBoosting...\n| trial |      loss | quantile | learning_rate | max_iter | max_leaf_nodes | max_depth | min_samples_leaf | l2_regularization |      r2 | best_r2 |    rmse | best_rmse | time_trial | time_ht |    state |\n| ----- | --------- | -------- | ------------- | -------- | -------------- | --------- | ---------------- | ----------------- | ------- | ------- | ------- | --------- | ---------- | ------- | -------- |\n| 0     | absolut.. 
|      0.1 |        0.0236 |      180 |             26 |        12 |               11 |               0.0 |  0.5373 |  0.5373 | -2.1398 |   -2.1398 |     0.968s |  0.968s | COMPLETE |\n| 1     |     gamma |      0.5 |         0.242 |      160 |             38 |         3 |               20 |               0.0 |   0.574 |   0.574 | -2.1598 |   -2.1398 |     0.160s |  1.128s | COMPLETE |\n| 2     |  quantile |      0.4 |        0.2448 |      210 |             12 |         3 |               25 |               0.3 |  0.4714 |   0.574 | -2.3253 |   -2.1398 |     0.422s |  1.550s | COMPLETE |\n| 3     |  quantile |      0.6 |         0.017 |      480 |             28 |        16 |               13 |               0.1 |  0.5712 |   0.574 | -2.1385 |   -2.1385 |     3.405s |  4.956s | COMPLETE |\n| 4     | squared.. |      1.0 |        0.2649 |       70 |             10 |        10 |               28 |               0.8 |  0.5561 |   0.574 | -2.2019 |   -2.1385 |     0.148s |  5.104s | COMPLETE |\n| 5     | squared.. |      0.1 |        0.0283 |      360 |             32 |         9 |               11 |               0.5 |  0.5464 |   0.574 | -2.1197 |   -2.1197 |     1.248s |  6.352s | COMPLETE |\n| 6     |  quantile |      0.4 |        0.1264 |      380 |             37 |        12 |               29 |               1.0 |  0.4416 |   0.574 | -2.3713 |   -2.1197 |     3.002s |  9.354s | COMPLETE |\n| 7     |     gamma |      0.6 |         0.678 |      330 |             25 |         6 |               12 |               0.8 |  0.4299 |   0.574 | -2.3984 |   -2.1197 |     0.739s | 10.092s | COMPLETE |\n| 8     | absolut.. |      0.9 |        0.0831 |      280 |             42 |        16 |               10 |               1.0 |  0.5242 |   0.574 | -2.2742 |   -2.1197 |     2.002s | 12.094s | COMPLETE |\n| 9     | absolut.. 
|      0.6 |        0.0373 |      300 |             40 |        13 |               17 |               0.8 |  0.5685 |   0.574 |   -2.17 |   -2.1197 |     1.859s | 13.953s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 5\nBest parameters:\n --> loss: squared_error\n --> quantile: 0.1\n --> learning_rate: 0.0283\n --> max_iter: 360\n --> max_leaf_nodes: 32\n --> max_depth: 9\n --> min_samples_leaf: 11\n --> l2_regularization: 0.5\nBest evaluation --> r2: 0.5464   rmse: -2.1197\nTime elapsed: 13.953s\nFit ---------------------------------------------\nTrain evaluation --> r2: 0.7959   rmse: -1.4619\nTest evaluation --> r2: 0.5479   rmse: -2.1351\nTime elapsed: 1.470s\nBootstrap ---------------------------------------\nEvaluation --> r2: 0.5259 \u00b1 0.0154   rmse: -2.1861 \u00b1 0.0352\nTime elapsed: 7.930s\n-------------------------------------------------\nTime: 23.353s\n\n\nFinal results ==================== >>\nTotal time: 25.299s\n-------------------------------------\nLinearSVM            --> r2: 0.4577 \u00b1 0.002   rmse: -2.3384 \u00b1 0.0043\nHistGradientBoosting --> r2: 0.5259 \u00b1 0.0154   rmse: -2.1861 \u00b1 0.0352 ~ !\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Check the robustness of the pipeline using cross-validation\natom.winner.cross_validate()\n
                                                                                                                                                                                                                                                                                                                                                                  # Check the robustness of the pipeline using cross-validation atom.winner.cross_validate()
                                                                                                                                                                                                                                                                                                                                                                  Applying cross-validation...\n
                                                                                                                                                                                                                                                                                                                                                                  Out[6]: train_r2 test_r2 train_rmse test_rmse time (s) 0 0.796038 0.541990 -1.453147 -2.196943 1.392266 1 0.794954 0.540424 -1.457709 -2.196179 1.436932 2 0.790722 0.505922 -1.492522 -2.153457 1.444314 3 0.785317 0.580703 -1.474827 -2.189902 1.432303 4 0.795872 0.547917 -1.461929 -2.135072 1.747591 mean 0.792581 0.543391 -1.468027 -2.174311 1.490681 std 0.004114 0.023780 0.014222 0.025330 0.129719 In\u00a0[8]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # The columns in the results dataframe contain one for each metric\natom.results[[\"r2_ht\", \"r2_train\", \"r2_test\", \"rmse_ht\", \"rmse_train\", \"rmse_test\"]]\n
                                                                                                                                                                                                                                                                                                                                                                  # The columns in the results dataframe contain one for each metric atom.results[[\"r2_ht\", \"r2_train\", \"r2_test\", \"rmse_ht\", \"rmse_train\", \"rmse_test\"]] Out[8]: r2_ht r2_train r2_test rmse_ht rmse_train rmse_test lSVM 0.492530 0.4583 0.4552 -2.262754 -2.3815 -2.3439 hGBM 0.546368 0.7183 0.4971 -2.119672 -1.7173 -2.2518 In\u00a0[9]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Some plots allow us to choose the metric we want to show\nwith atom.canvas():\n    atom.plot_trials(metric=\"r2\", title=\"Hyperparameter tuning performance for R2\")\n    atom.plot_trials(metric=\"rmse\", title=\"Hyperparameter tuning performance for RMSE\")\n
                                                                                                                                                                                                                                                                                                                                                                  # Some plots allow us to choose the metric we want to show with atom.canvas(): atom.plot_trials(metric=\"r2\", title=\"Hyperparameter tuning performance for R2\") atom.plot_trials(metric=\"rmse\", title=\"Hyperparameter tuning performance for RMSE\") In\u00a0[10]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_results(metric=\"r2\")\n
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_results(metric=\"r2\")"}, {"location": "examples/multi_metric/#example-multi-metric-runs", "title": "Example: Multi-metric runs\u00b6", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This example shows how to evaluate an atom's pipeline on multiple metrics.

                                                                                                                                                                                                                                                                                                                                                                  Import the breast cancer dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/multi_metric/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/multi_metric/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/multi_metric/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/multiclass_classification/", "title": "Multiclass classification", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Import packages\nfrom sklearn.datasets import load_wine\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                                                                                                                                                                                                  # Import packages from sklearn.datasets import load_wine from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Load data\nX, y = load_wine(return_X_y=True, as_frame=True)\n\n# Let's have a look\nX.head()\n
                                                                                                                                                                                                                                                                                                                                                                  # Load data X, y = load_wine(return_X_y=True, as_frame=True) # Let's have a look X.head() Out[2]: alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue od280/od315_of_diluted_wines proline 0 14.23 1.71 2.43 15.6 127.0 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065.0 1 13.20 1.78 2.14 11.2 100.0 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050.0 2 13.16 2.36 2.67 18.6 101.0 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185.0 3 14.37 1.95 2.50 16.8 113.0 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480.0 4 13.24 2.59 2.87 21.0 118.0 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735.0 In\u00a0[3]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMClassifier(X, y, n_jobs=-1, verbose=2, random_state=1)\n\n# Fit the pipeline with the selected models\natom.run(\n    models=[\"LR\",\"LDA\", \"RF\"],\n    metric=\"roc_auc_ovr\",\n    n_trials=14,\n    n_bootstrap=5,\n    errors=\"raise\",\n)\n
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMClassifier(X, y, n_jobs=-1, verbose=2, random_state=1) # Fit the pipeline with the selected models atom.run( models=[\"LR\",\"LDA\", \"RF\"], metric=\"roc_auc_ovr\", n_trials=14, n_bootstrap=5, errors=\"raise\", )
                                                                                                                                                                                                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Multiclass classification.\nParallel processing with 16 cores.\nParallelization backend: loky\n\nDataset stats ==================== >>\nShape: (178, 14)\nTrain set size: 143\nTest set size: 35\n-------------------------------------\nMemory: 19.36 kB\nScaled: False\nOutlier values: 12 (0.6%)\n\n\nTraining ========================= >>\nModels: LR, LDA, RF\nMetric: roc_auc_ovr\n\n\nRunning hyperparameter tuning for LogisticRegression...\n| trial | penalty |       C |  solver | max_iter | l1_ratio | roc_auc_ovr | best_roc_auc_ovr | time_trial | time_ht |    state |\n| ----- | ------- | ------- | ------- | -------- | -------- | ----------- | ---------------- | ---------- | ------- | -------- |\n| 0     |      l1 |  0.0054 |    saga |      480 |      0.7 |         0.5 |              0.5 |    10.567s | 10.567s | COMPLETE |\n| 1     |      l1 |   0.122 |    saga |      380 |      0.7 |      0.9951 |           0.9951 |    11.247s | 21.814s | COMPLETE |\n| 2     |      l2 |  0.0071 |     sag |      720 |      0.3 |         1.0 |              1.0 |    12.060s | 33.874s | COMPLETE |\n| 3     |      l1 | 87.9641 | libli.. 
|      920 |      0.3 |         1.0 |              1.0 |    10.158s | 44.032s | COMPLETE |\n| 4     |      l2 |  0.0114 |     sag |      630 |      0.7 |         1.0 |              1.0 |     7.990s | 52.022s | COMPLETE |\n| 5     |      l2 |  0.0018 |     sag |      920 |      0.1 |         1.0 |              1.0 |    11.685s | 01m:04s | COMPLETE |\n| 6     |      l2 | 43.4053 |     sag |      780 |      0.3 |         1.0 |              1.0 |     8.361s | 01m:12s | COMPLETE |\n| 7     |      l2 |  2.0759 | libli.. |      470 |      0.2 |         1.0 |              1.0 |     8.213s | 01m:20s | COMPLETE |\n| 8     |    None |   0.043 |     sag |      110 |      1.0 |         1.0 |              1.0 |     7.450s | 01m:28s | COMPLETE |\n| 9     |      l1 | 46.0233 |    saga |      740 |      0.1 |         1.0 |              1.0 |     7.951s | 01m:36s | COMPLETE |\n| 10    |      l2 |  0.4557 |   lbfgs |      280 |      0.5 |         1.0 |              1.0 |     8.807s | 01m:44s | COMPLETE |\n| 11    |      l2 |  0.0013 | libli.. |      940 |      0.4 |         1.0 |              1.0 |     7.970s | 01m:52s | COMPLETE |\n| 12    |      l2 |  4.8717 | newto.. |      780 |      0.3 |         1.0 |              1.0 |     8.202s | 02m:01s | COMPLETE |\n| 13    |      l2 |  0.0324 | libli.. 
|     1000 |      0.0 |         1.0 |              1.0 |     7.676s | 02m:08s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 2\nBest parameters:\n --> penalty: l2\n --> C: 0.0071\n --> solver: sag\n --> max_iter: 720\n --> l1_ratio: 0.3\nBest evaluation --> roc_auc_ovr: 1.0\nTime elapsed: 02m:08s\nFit ---------------------------------------------\nTrain evaluation --> roc_auc_ovr: 0.9991\nTest evaluation --> roc_auc_ovr: 0.9977\nTime elapsed: 0.542s\nBootstrap ---------------------------------------\nEvaluation --> roc_auc_ovr: 0.9984 \u00b1 0.001\nTime elapsed: 0.603s\n-------------------------------------------------\nTime: 02m:09s\n\n\nRunning hyperparameter tuning for LinearDiscriminantAnalysis...\n| trial |  solver | shrinkage | roc_auc_ovr | best_roc_auc_ovr | time_trial | time_ht |    state |\n| ----- | ------- | --------- | ----------- | ---------------- | ---------- | ------- | -------- |\n| 0     |    lsqr |       0.9 |      0.9221 |           0.9221 |     0.048s |  0.048s | COMPLETE |\n| 1     |   eigen |       1.0 |      0.9121 |           0.9221 |     0.027s |  0.074s | COMPLETE |\n| 2     |   eigen |       1.0 |      0.9121 |           0.9221 |     0.001s |  0.075s | COMPLETE |\n| 3     |    lsqr |       0.7 |      0.8638 |           0.9221 |     0.025s |  0.100s | COMPLETE |\n| 4     |   eigen |       0.7 |      0.9019 |           0.9221 |     0.024s |  0.124s | COMPLETE |\n| 5     |    lsqr |      auto |         1.0 |              1.0 |     0.025s |  0.149s | COMPLETE |\n| 6     |   eigen |       1.0 |      0.9121 |              1.0 |     0.000s |  0.149s | COMPLETE |\n| 7     |    lsqr |       1.0 |      0.9445 |              1.0 |     0.026s |  0.175s | COMPLETE |\n| 8     |     svd |      None |         1.0 |              1.0 |     0.025s |  0.200s | COMPLETE |\n| 9     |     svd |      None |         1.0 |              1.0 |     0.001s |  0.201s | COMPLETE |\n| 10    |    lsqr |      auto |         1.0 |          
    1.0 |     0.002s |  0.203s | COMPLETE |\n| 11    |     svd |      None |         1.0 |              1.0 |     0.002s |  0.205s | COMPLETE |\n| 12    |     svd |      None |         1.0 |              1.0 |     0.001s |  0.206s | COMPLETE |\n| 13    |     svd |      None |         1.0 |              1.0 |     0.001s |  0.207s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 5\nBest parameters:\n --> solver: lsqr\n --> shrinkage: auto\nBest evaluation --> roc_auc_ovr: 1.0\nTime elapsed: 0.207s\nFit ---------------------------------------------\nTrain evaluation --> roc_auc_ovr: 1.0\nTest evaluation --> roc_auc_ovr: 1.0\nTime elapsed: 0.025s\nBootstrap ---------------------------------------\nEvaluation --> roc_auc_ovr: 0.9998 \u00b1 0.0005\nTime elapsed: 0.038s\n-------------------------------------------------\nTime: 0.271s\n\n\nRunning hyperparameter tuning for RandomForest...\n| trial | n_estimators | criterion | max_depth | min_samples_split | min_samples_leaf | max_features | bootstrap | max_samples | ccp_alpha | roc_auc_ovr | best_roc_auc_ovr | time_trial | time_ht |    state |\n| ----- | ------------ | --------- | --------- | ----------------- | ---------------- | ------------ | --------- | ----------- | --------- | ----------- | ---------------- | ---------- | ------- | -------- |\n| 0     |          210 |      gini |        10 |                17 |               20 |          0.5 |     False |        None |       0.0 |      0.9803 |           0.9803 |     0.249s |  0.249s | COMPLETE |\n| 1     |          380 |      gini |         4 |                15 |                3 |          0.9 |     False |        None |      0.01 |      0.9816 |           0.9816 |     0.456s |  0.705s | COMPLETE |\n| 2     |          380 |   entropy |         6 |                 2 |               13 |          0.9 |     False |        None |      0.03 |      0.9944 |           0.9944 |     0.502s |  1.206s | COMPLETE |\n| 3     |          470 |     
 gini |        11 |                 9 |               18 |          nan |      True |         0.6 |     0.025 |      0.9569 |           0.9944 |     9.106s | 10.312s | COMPLETE |\n| 4     |          100 |   entropy |        12 |                14 |                6 |          0.9 |     False |         nan |     0.035 |         1.0 |              1.0 |     8.530s | 18.842s | COMPLETE |\n| 5     |          470 |   entropy |        13 |                11 |                1 |          nan |      True |         0.6 |      0.01 |         1.0 |              1.0 |     1.391s | 20.233s | COMPLETE |\n| 6     |          250 |      gini |        14 |                13 |               17 |          0.7 |      True |         nan |      0.02 |         1.0 |              1.0 |     0.754s | 20.987s | COMPLETE |\n| 7     |          220 |      gini |         5 |                10 |                7 |          0.5 |      True |         0.9 |     0.035 |      0.9981 |              1.0 |     0.712s | 21.699s | COMPLETE |\n| 8     |          130 |   entropy |         4 |                 6 |               11 |          0.9 |     False |         nan |      0.03 |         1.0 |              1.0 |     0.532s | 22.231s | COMPLETE |\n| 9     |          370 |      gini |        12 |                 2 |                4 |          0.5 |     False |         nan |      0.02 |      0.9916 |              1.0 |     0.823s | 23.055s | COMPLETE |\n| 10    |           10 |   entropy |        12 |                20 |                7 |         log2 |     False |         nan |     0.035 |         1.0 |              1.0 |     0.522s | 23.577s | COMPLETE |\n| 11    |           70 |   entropy |        13 |                12 |                1 |         None |      True |         0.5 |      0.01 |      0.9928 |              1.0 |     0.614s | 24.191s | COMPLETE |\n| 12    |          500 |   entropy |         9 |                 7 |                7 |          0.6 |      True |         0.6 |      0.01 |        
 1.0 |              1.0 |     1.139s | 25.330s | COMPLETE |\n| 13    |          140 |   entropy |        16 |                16 |                1 |          0.8 |      True |         0.7 |       0.0 |         1.0 |              1.0 |     0.750s | 26.080s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 4\nBest parameters:\n --> n_estimators: 100\n --> criterion: entropy\n --> max_depth: 12\n --> min_samples_split: 14\n --> min_samples_leaf: 6\n --> max_features: 0.9\n --> bootstrap: False\n --> max_samples: None\n --> ccp_alpha: 0.035\nBest evaluation --> roc_auc_ovr: 1.0\nTime elapsed: 26.080s\nFit ---------------------------------------------\nTrain evaluation --> roc_auc_ovr: 0.9993\nTest evaluation --> roc_auc_ovr: 1.0\nTime elapsed: 0.737s\nBootstrap ---------------------------------------\nEvaluation --> roc_auc_ovr: 0.9936 \u00b1 0.0067\nTime elapsed: 0.721s\n-------------------------------------------------\nTime: 27.539s\n\n\nFinal results ==================== >>\nTotal time: 02m:40s\n-------------------------------------\nLogisticRegression         --> roc_auc_ovr: 0.9984 \u00b1 0.001\nLinearDiscriminantAnalysis --> roc_auc_ovr: 0.9998 \u00b1 0.0005 !\nRandomForest               --> roc_auc_ovr: 0.9936 \u00b1 0.0067\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.results\n
                                                                                                                                                                                                                                                                                                                                                                  atom.results Out[4]: roc_auc_ovr_ht time_ht roc_auc_ovr_train roc_auc_ovr_test time_fit roc_auc_ovr_bootstrap time_bootstrap time LR 1.0 128.337325 0.9979 0.9977 0.542487 0.998413 0.602810 129.482622 LDA 1.0 0.207456 1.0000 0.9989 0.025409 0.999773 0.038035 0.270900 RF 1.0 26.080413 0.9951 0.9919 0.737324 0.993613 0.721398 27.539135 In\u00a0[5]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Show the score for some different metrics\natom.evaluate([\"precision_macro\", \"recall_macro\", \"jaccard_weighted\"])\n
                                                                                                                                                                                                                                                                                                                                                                  # Show the score for some different metrics atom.evaluate([\"precision_macro\", \"recall_macro\", \"jaccard_weighted\"]) Out[5]: precision_macro recall_macro jaccard_weighted LR 0.9429 0.9484 0.8924 LDA 0.9667 0.9762 0.9457 RF 0.8799 0.8915 0.7968 In\u00a0[10]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Some plots allow you to choose the target class to look at\natom.rf.plot_probabilities(rows=\"train\", target=0)\n
                                                                                                                                                                                                                                                                                                                                                                  # Some plots allow you to choose the target class to look at atom.rf.plot_probabilities(rows=\"train\", target=0) In\u00a0[8]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.lda.plot_shap_heatmap(target=2, show=7)\n
                                                                                                                                                                                                                                                                                                                                                                  atom.lda.plot_shap_heatmap(target=2, show=7)"}, {"location": "examples/multiclass_classification/#example-multiclass-classification", "title": "Example: Multiclass classification\u00b6", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This example shows how to compare the performance of three models on a multiclass classification task.

                                                                                                                                                                                                                                                                                                                                                                  Import the wine dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict wines into three groups (which cultivator it's from) using features based on the results of chemical analysis.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/multiclass_classification/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/multiclass_classification/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/multiclass_classification/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/multilabel_classification/", "title": "Multilabel classification", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\nfrom sklearn.datasets import make_multilabel_classification\n
                                                                                                                                                                                                                                                                                                                                                                  # Import packages import pandas as pd from atom import ATOMClassifier from sklearn.datasets import make_multilabel_classification In\u00a0[2]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Create data\nX, y = make_multilabel_classification(n_samples=300, n_classes=3, random_state=1)\n
                                                                                                                                                                                                                                                                                                                                                                  # Create data X, y = make_multilabel_classification(n_samples=300, n_classes=3, random_state=1) In\u00a0[3]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Note that for multioutput tasks, you must specify the `y` keyword\natom = ATOMClassifier(X, y=y, verbose=2, random_state=1)\n
                                                                                                                                                                                                                                                                                                                                                                  # Note that for multioutput tasks, you must specify the `y` keyword atom = ATOMClassifier(X, y=y, verbose=2, random_state=1)
                                                                                                                                                                                                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Multilabel classification.\n\nDataset stats ==================== >>\nShape: (300, 23)\nTrain set size: 240\nTest set size: 60\n-------------------------------------\nMemory: 51.73 kB\nScaled: False\nOutlier values: 29 (0.5%)\n\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Show the models that natively support multilabel tasks\natom.available_models()[[\"acronym\", \"model\", \"native_multilabel\"]]\n
                                                                                                                                                                                                                                                                                                                                                                  # Show the models that natively support multilabel tasks atom.available_models()[[\"acronym\", \"model\", \"native_multilabel\"]] Out[4]: acronym model native_multilabel 0 AdaB AdaBoost False 1 Bag Bagging False 2 BNB BernoulliNB False 3 CatB CatBoost False 4 CatNB CategoricalNB False 5 CNB ComplementNB False 6 Tree DecisionTree True 7 Dummy Dummy False 8 ETree ExtraTree True 9 ET ExtraTrees True 10 GNB GaussianNB False 11 GP GaussianProcess False 12 GBM GradientBoostingMachine False 13 hGBM HistGradientBoosting False 14 KNN KNearestNeighbors True 15 LGB LightGBM False 16 LDA LinearDiscriminantAnalysis False 17 lSVM LinearSVM False 18 LR LogisticRegression False 19 MLP MultiLayerPerceptron True 20 MNB MultinomialNB False 21 PA PassiveAggressive False 22 Perc Perceptron False 23 QDA QuadraticDiscriminantAnalysis False 24 RNN RadiusNearestNeighbors True 25 RF RandomForest True 26 Ridge Ridge True 27 SGD StochasticGradientDescent False 28 SVM SupportVectorMachine False 29 XGB XGBoost False In\u00a0[5]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.run(models=[\"LDA\", \"RF\"], metric=\"recall_weighted\")\n
                                                                                                                                                                                                                                                                                                                                                                  atom.run(models=[\"LDA\", \"RF\"], metric=\"recall_weighted\")
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: LDA, RF\nMetric: recall_weighted\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --> recall_weighted: 0.9124\nTest evaluation --> recall_weighted: 0.8351\nTime elapsed: 0.037s\n-------------------------------------------------\nTime: 0.037s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> recall_weighted: 1.0\nTest evaluation --> recall_weighted: 0.8763\nTime elapsed: 0.170s\n-------------------------------------------------\nTime: 0.170s\n\n\nFinal results ==================== >>\nTotal time: 0.269s\n-------------------------------------\nLinearDiscriminantAnalysis --> recall_weighted: 0.8351\nRandomForest               --> recall_weighted: 0.8763 !\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Note that non-native multioutput models use a meta-estimator wrapper\nprint(f\"Estimator for LDA is: {atom.lda.estimator}\")\nprint(f\"Estimator for RF is: {atom.rf.estimator}\")\n
                                                                                                                                                                                                                                                                                                                                                                  # Note that non-native multioutput models use a meta-estimator wrapper print(f\"Estimator for LDA is: {atom.lda.estimator}\") print(f\"Estimator for RF is: {atom.rf.estimator}\")
                                                                                                                                                                                                                                                                                                                                                                  Estimator for LDA is: ClassifierChain(base_estimator=LinearDiscriminantAnalysis(), random_state=1)\nEstimator for RF is: RandomForestClassifier(n_jobs=1, random_state=1)\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[7]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  from atom import ATOMModel\nfrom sklearn.multioutput import ClassifierChain\nfrom sklearn.linear_model import LogisticRegression\nfrom optuna.distributions import CategoricalDistribution, IntDistribution\n\ncustom_model = ATOMModel(\n    estimator=ClassifierChain(LogisticRegression(), cv=3),\n    name=\"chain\",\n    needs_scaling=True,\n    native_multilabel=True,\n)\n\natom.run(\n    models=custom_model,\n    n_trials=5,\n    ht_params={\n        \"distributions\": {\n            \"order\": CategoricalDistribution([[0, 1, 2], [2, 1, 0], [1, 2, 0]]),\n            \"base_estimator__max_iter\": IntDistribution(100, 200, step=10),\n            \"base_estimator__solver\": CategoricalDistribution([\"lbfgs\", \"newton-cg\"]),            \n        }\n    },\n)\n
                                                                                                                                                                                                                                                                                                                                                                  from atom import ATOMModel from sklearn.multioutput import ClassifierChain from sklearn.linear_model import LogisticRegression from optuna.distributions import CategoricalDistribution, IntDistribution custom_model = ATOMModel( estimator=ClassifierChain(LogisticRegression(), cv=3), name=\"chain\", needs_scaling=True, native_multilabel=True, ) atom.run( models=custom_model, n_trials=5, ht_params={ \"distributions\": { \"order\": CategoricalDistribution([[0, 1, 2], [2, 1, 0], [1, 2, 0]]), \"base_estimator__max_iter\": IntDistribution(100, 200, step=10), \"base_estimator__solver\": CategoricalDistribution([\"lbfgs\", \"newton-cg\"]), } }, )
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: chain\nMetric: recall_weighted\n\n\nRunning hyperparameter tuning for ClassifierChain...\n| trial |     order | base_estimator__max_iter | base_estimator__solver | recall_weighted | best_recall_weighted | time_trial | time_ht |    state |\n| ----- | --------- | ------------------------ | ---------------------- | --------------- | -------------------- | ---------- | ------- | -------- |\n| 0     | [2, 1, 0] |                      130 |                  lbfgs |          0.8831 |               0.8831 |     2.813s |  2.813s | COMPLETE |\n| 1     | [1, 2, 0] |                      150 |              newton-cg |          0.9091 |               0.9091 |     2.184s |  4.997s | COMPLETE |\n| 2     | [2, 1, 0] |                      170 |              newton-cg |          0.8701 |               0.9091 |     0.085s |  5.082s | COMPLETE |\n| 3     | [1, 2, 0] |                      200 |              newton-cg |          0.9221 |               0.9221 |     0.084s |  5.166s | COMPLETE |\n| 4     | [2, 1, 0] |                      100 |              newton-cg |          0.8701 |               0.9221 |     0.078s |  5.244s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 3\nBest parameters:\n --> order: [1, 2, 0]\n --> base_estimator__max_iter: 200\n --> base_estimator__solver: newton-cg\nBest evaluation --> recall_weighted: 0.9221\nTime elapsed: 5.244s\nFit ---------------------------------------------\nTrain evaluation --> recall_weighted: 0.9021\nTest evaluation --> recall_weighted: 0.866\nTime elapsed: 
0.101s\n-------------------------------------------------\nTime: 5.345s\n\n\nFinal results ==================== >>\nTotal time: 5.397s\n-------------------------------------\nClassifierChain --> recall_weighted: 0.866\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[8]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  thresholds = atom.rf.get_best_threshold()\nprint(f\"Best threshold per target column: {thresholds}\")\n
                                                                                                                                                                                                                                                                                                                                                                  thresholds = atom.rf.get_best_threshold() print(f\"Best threshold per target column: {thresholds}\")
                                                                                                                                                                                                                                                                                                                                                                  Best threshold per target column: [0.7, 0.69, 0.63]\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[9]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.rf.evaluate(threshold=thresholds)\n
                                                                                                                                                                                                                                                                                                                                                                  atom.rf.evaluate(threshold=thresholds) Out[9]:
                                                                                                                                                                                                                                                                                                                                                                  accuracy              0.5667\nap                    0.8893\nf1_weighted           0.7274\njaccard_weighted      0.6271\nprecision_weighted    0.8269\nrecall_weighted       0.6495\nauc                   0.9213\nName: RF, dtype: float64
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[10]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Use the target parameter in plots to specify which target column to use\natom.plot_roc(target=2)\n
                                                                                                                                                                                                                                                                                                                                                                  # Use the target parameter in plots to specify which target column to use atom.plot_roc(target=2) In\u00a0[11]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # When the target parameter also specifies the class, use format (column, class)\natom.plot_probabilities(models=\"chain\", target=(2, 1))\n
                                                                                                                                                                                                                                                                                                                                                                  # When the target parameter also specifies the class, use format (column, class) atom.plot_probabilities(models=\"chain\", target=(2, 1)) In\u00a0[12]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  with atom.canvas(figsize=(900, 600)):\n    atom.plot_calibration(target=0)\n    atom.plot_calibration(target=1)\n
                                                                                                                                                                                                                                                                                                                                                                  with atom.canvas(figsize=(900, 600)): atom.plot_calibration(target=0) atom.plot_calibration(target=1)"}, {"location": "examples/multilabel_classification/#example-multilabel-classification", "title": "Example: Multilabel classification\u00b6", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This example shows how to use ATOM to solve a multilabel classification problem.

                                                                                                                                                                                                                                                                                                                                                                  The data used is a synthetic dataset created using sklearn's make_multilabel_classification function.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/multilabel_classification/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/multilabel_classification/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/multilabel_classification/#add-custom-multilabel-models", "title": "Add custom multilabel models\u00b6", "text": "

                                                                                                                                                                                                                                                                                                                                                                  To use your own meta-estimator with custom parameters, add it as a custom model. It's also possible to tune the hyperparameters of this custom meta-estimator.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/multilabel_classification/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/multioutput_regression/", "title": "Multioutput regression", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Disable annoying tf warnings\nimport os\nos.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"3\"\n\nfrom tensorflow import get_logger\nget_logger().setLevel('ERROR')\n\nimport numpy as np\nfrom atom import ATOMRegressor, ATOMModel\nfrom sklearn.datasets import make_regression\n\nfrom scikeras.wrappers import KerasRegressor\nfrom keras.models import Sequential\nfrom keras.layers import Dense\n
                                                                                                                                                                                                                                                                                                                                                                  # Disable annoying tf warnings import os os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"3\" from tensorflow import get_logger get_logger().setLevel('ERROR') import numpy as np from atom import ATOMRegressor, ATOMModel from sklearn.datasets import make_regression from scikeras.wrappers import KerasRegressor from keras.models import Sequential from keras.layers import Dense In\u00a0[2]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Create data\nX, y = make_regression(n_samples=1000, n_features=10, n_informative=5, n_targets=3)\n
                                                                                                                                                                                                                                                                                                                                                                  # Create data X, y = make_regression(n_samples=1000, n_features=10, n_informative=5, n_targets=3) In\u00a0[3]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Create the neural network\nclass NeuralNetwork(KerasRegressor):\n    \"\"\"Multioutput multilayer perceptron.\"\"\"\n\n    @staticmethod\n    def _keras_build_fn(n_inputs, n_outputs, **kwargs):\n        \"\"\"Create the model's architecture.\"\"\"\n        model = Sequential()\n        model.add(Dense(20, input_dim=n_inputs, activation=\"relu\"))\n        model.add(Dense(20, activation=\"relu\"))\n        model.add(Dense(n_outputs))\n        model.compile(loss=\"mse\", optimizer=\"adam\")\n        return model\n
                                                                                                                                                                                                                                                                                                                                                                  # Create the neural network class NeuralNetwork(KerasRegressor): \"\"\"Multioutput multilayer perceptron.\"\"\" @staticmethod def _keras_build_fn(n_inputs, n_outputs, **kwargs): \"\"\"Create the model's architecture.\"\"\" model = Sequential() model.add(Dense(20, input_dim=n_inputs, activation=\"relu\")) model.add(Dense(20, activation=\"relu\")) model.add(Dense(n_outputs)) model.compile(loss=\"mse\", optimizer=\"adam\") return model In\u00a0[4]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Convert the model to an ATOM model\nmodel = ATOMModel(\n    estimator=NeuralNetwork(n_inputs=5, n_outputs=y.shape[1], epochs=100, verbose=0),\n    name=\"NN\",\n    needs_scaling=True,  # Applies automated feature scaling before fitting\n    native_multioutput=True,  # Do not use a multioutput meta-estimator wrapper\n)\n
                                                                                                                                                                                                                                                                                                                                                                  # Convert the model to an ATOM model model = ATOMModel( estimator=NeuralNetwork(n_inputs=5, n_outputs=y.shape[1], epochs=100, verbose=0), name=\"NN\", needs_scaling=True, # Applies automated feature scaling before fitting native_multioutput=True, # Do not use a multioutput meta-estimator wrapper ) In\u00a0[5]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMRegressor(X, y=y, verbose=2, random_state=1)\n
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMRegressor(X, y=y, verbose=2, random_state=1)
                                                                                                                                                                                                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Multioutput regression.\n\nDataset stats ==================== >>\nShape: (1000, 13)\nTrain set size: 800\nTest set size: 200\n-------------------------------------\nMemory: 104.13 kB\nScaled: True\nOutlier values: 27 (0.3%)\n\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Show the models that natively support multioutput tasks\natom.available_models()[[\"acronym\", \"model\", \"native_multioutput\"]]\n
                                                                                                                                                                                                                                                                                                                                                                  # Show the models that natively support multioutput tasks atom.available_models()[[\"acronym\", \"model\", \"native_multioutput\"]] Out[6]: acronym model native_multioutput 0 AdaB AdaBoost False 1 ARD AutomaticRelevanceDetermination False 2 Bag Bagging False 3 BR BayesianRidge False 4 CatB CatBoost False 5 Tree DecisionTree True 6 Dummy Dummy False 7 EN ElasticNet False 8 ETree ExtraTree True 9 ET ExtraTrees True 10 GP GaussianProcess False 11 GBM GradientBoostingMachine False 12 Huber HuberRegression False 13 hGBM HistGradientBoosting False 14 KNN KNearestNeighbors True 15 Lasso Lasso False 16 Lars LeastAngleRegression False 17 LGB LightGBM False 18 lSVM LinearSVM False 19 MLP MultiLayerPerceptron False 20 OLS OrdinaryLeastSquares False 21 OMP OrthogonalMatchingPursuit False 22 PA PassiveAggressive False 23 RNN RadiusNearestNeighbors True 24 RF RandomForest True 25 Ridge Ridge False 26 SGD StochasticGradientDescent False 27 SVM SupportVectorMachine False 28 XGB XGBoost False In\u00a0[7]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Note we only added 5 informative features to the dataset, let's remove the rest\n# If we use a model with no native support for multioutput as solver, specify the\n# rfe's importance_getter parameter and return the mean of the coefficients over the\n# target columns\natom.feature_selection(\n    strategy=\"rfe\",\n    solver=\"ols\",  # This becomes MultiOutputRegressor(OLS)\n    n_features=5,\n    importance_getter=lambda x: np.mean([e.coef_ for e in x.estimators_], axis=0),\n)\n
                                                                                                                                                                                                                                                                                                                                                                  # Note we only added 5 informative features to the dataset, let's remove the rest # If we use a model with no native support for multioutput as solver, specify the # rfe's importance_getter parameter and return the mean of the coefficients over the # target columns atom.feature_selection( strategy=\"rfe\", solver=\"ols\", # This becomes MultiOutputRegressor(OLS) n_features=5, importance_getter=lambda x: np.mean([e.coef_ for e in x.estimators_], axis=0), )
                                                                                                                                                                                                                                                                                                                                                                  Fitting FeatureSelector...\nPerforming feature selection ...\n --> rfe selected 5 features from the dataset.\n   --> Dropping feature x0 (rank 6).\n   --> Dropping feature x5 (rank 5).\n   --> Dropping feature x6 (rank 3).\n   --> Dropping feature x7 (rank 2).\n   --> Dropping feature x9 (rank 4).\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[8]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Let's train a native, non-native and our custom model\natom.run(models=[\"Lasso\", \"RF\", model], metric=\"mse\")\n
                                                                                                                                                                                                                                                                                                                                                                  # Let's train a native, non-native and our custom model atom.run(models=[\"Lasso\", \"RF\", model], metric=\"mse\")
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: Lasso, RF, NN\nMetric: mse\n\n\nResults for Lasso:\nFit ---------------------------------------------\nTrain evaluation --> mse: -5.1516\nTest evaluation --> mse: -5.5774\nTime elapsed: 0.031s\n-------------------------------------------------\nTime: 0.031s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> mse: -200.7336\nTest evaluation --> mse: -1494.3406\nTime elapsed: 0.706s\n-------------------------------------------------\nTime: 0.706s\n\n\nResults for NeuralNetwork:\nFit ---------------------------------------------\nTrain evaluation --> mse: -111.3789\nTest evaluation --> mse: -105.2649\nTime elapsed: 2.372s\n-------------------------------------------------\nTime: 2.372s\n\n\nFinal results ==================== >>\nTotal time: 3.116s\n-------------------------------------\nLasso         --> mse: -5.5774 !\nRandomForest  --> mse: -1494.3406 ~\nNeuralNetwork --> mse: -105.2649\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[9]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # And check which of the models used a meta-estimator wrapper\nfor m in atom.models:\n    print(f\"Estimator for {m} is: {atom[m].estimator}\")\n
                                                                                                                                                                                                                                                                                                                                                                  # And check which of the models used a meta-estimator wrapper for m in atom.models: print(f\"Estimator for {m} is: {atom[m].estimator}\")
                                                                                                                                                                                                                                                                                                                                                                  Estimator for Lasso is: MultiOutputRegressor(estimator=Lasso(random_state=1), n_jobs=1)\nEstimator for RF is: RandomForestRegressor(n_jobs=1, random_state=1)\nEstimator for NN is: NeuralNetwork(\n\tmodel=None\n\tbuild_fn=None\n\twarm_start=False\n\trandom_state=1\n\toptimizer=rmsprop\n\tloss=None\n\tmetrics=None\n\tbatch_size=None\n\tvalidation_batch_size=None\n\tverbose=0\n\tcallbacks=None\n\tvalidation_split=0.0\n\tshuffle=True\n\trun_eagerly=False\n\tepochs=100\n\tn_inputs=5\n\tn_outputs=3\n\tname=NN\n\tneeds_scaling=True\n\tnative_multioutput=True\n\tnative_multilabel=False\n\thas_validation=None\n)\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[10]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Use the target parameter in plots to specify which target column to use\natom.plot_residuals(target=2)\n
                                                                                                                                                                                                                                                                                                                                                                  # Use the target parameter in plots to specify which target column to use atom.plot_residuals(target=2) In\u00a0[11]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  with atom.canvas(3, 1, figsize=(900, 1300)):\n    atom.plot_errors(target=0)\n    atom.plot_errors(target=1)\n    atom.plot_errors(target=2)\n
                                                                                                                                                                                                                                                                                                                                                                  with atom.canvas(3, 1, figsize=(900, 1300)): atom.plot_errors(target=0) atom.plot_errors(target=1) atom.plot_errors(target=2)
                                                                                                                                                                                                                                                                                                                                                                  \n---------------------------------------------------------------------------\nValueError                                Traceback (most recent call last)\nCell In[11], line 2\n      1 with atom.canvas(3, 1, figsize=(900, 1300)):\n----> 2     atom.plot_errors(target=0)\n      3     atom.plot_errors(target=1)\n      4     atom.plot_errors(target=2)\n\nFile ~\\Documents\\Python\\ATOM\\atom\\utils\\utils.py:2712, in crash.<locals>.wrapper(*args, **kwargs)\n   2709     cache[\"last_exception\"] = ex\n   2710     args[0].logger.exception(\"Exception encountered:\")\n-> 2712 raise ex\n\nFile ~\\Documents\\Python\\ATOM\\atom\\utils\\utils.py:2704, in crash.<locals>.wrapper(*args, **kwargs)\n   2701 @wraps(f)\n   2702 def wrapper(*args, **kwargs) -> Any:\n   2703     try:  # Run the function\n-> 2704         return f(*args, **kwargs)\n   2706     except Exception as ex:\n   2707         # If exception is not the same as last, write to log\n   2708         if ex is not cache[\"last_exception\"] and args[0].logger:\n\nFile ~\\Documents\\Python\\ATOM\\atom\\plots\\predictionplot.py:691, in PredictionPlot.plot_errors(self, models, rows, target, title, legend, figsize, filename, display)\n    689         from atom.models import OrdinaryLeastSquares\n    690         model = OrdinaryLeastSquares(goal=self.task.goal, branches=self._branches)\n--> 691         estimator = model._get_est().fit(bk.DataFrame(y_true), y_pred)\n    693         fig.add_trace(\n    694             self._draw_line(\n    695                 x=(x := np.linspace(y_true.min(), y_true.max(), 100)),\n   (...)\n    703             )\n    704         )\n    706 
self._draw_straight_line(y=\"diagonal\", xaxis=xaxis, yaxis=yaxis)\n\nFile ~\\Documents\\Python\\ATOM\\venv310\\lib\\site-packages\\sklearn\\base.py:1152, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)\n   1145     estimator._validate_params()\n   1147 with config_context(\n   1148     skip_parameter_validation=(\n   1149         prefer_skip_nested_validation or global_skip_validation\n   1150     )\n   1151 ):\n-> 1152     return fit_method(estimator, *args, **kwargs)\n\nFile ~\\Documents\\Python\\ATOM\\venv310\\lib\\site-packages\\sklearn\\multioutput.py:248, in _MultiOutputEstimator.fit(self, X, y, sample_weight, **fit_params)\n    245     check_classification_targets(y)\n    247 if y.ndim == 1:\n--> 248     raise ValueError(\n    249         \"y must have at least two dimensions for \"\n    250         \"multi-output regression but has only one.\"\n    251     )\n    253 if _routing_enabled():\n    254     routed_params = process_routing(\n    255         obj=self,\n    256         method=\"fit\",\n    257         other_params=fit_params,\n    258         sample_weight=sample_weight,\n    259     )\n\nValueError: y must have at least two dimensions for multi-output regression but has only one.
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/multioutput_regression/#example-multioutput-regression", "title": "Example: Multioutput regression\u00b6", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This example shows how to use ATOM to make preditions on a multioutput regression dataset. One of the models used is a MLP regressor implemented with Keras using scikeras.

                                                                                                                                                                                                                                                                                                                                                                  The data used is a synthetic dataset created using sklearn's make_regression function.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/multioutput_regression/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/multioutput_regression/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/multioutput_regression/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/nlp/", "title": "NLP", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  import numpy as np\nfrom atom import ATOMClassifier\nfrom sklearn.datasets import fetch_20newsgroups\n
                                                                                                                                                                                                                                                                                                                                                                  import numpy as np from atom import ATOMClassifier from sklearn.datasets import fetch_20newsgroups In\u00a0[2]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Use only a subset of the available topics for faster processing\nX_text, y_text = fetch_20newsgroups(\n    return_X_y=True,\n    categories=[\n        'sci.med',\n        'comp.windows.x',\n        'misc.forsale',\n        'rec.autos',\n    ],\n    shuffle=True,\n    random_state=1,\n)\nX_text = np.array(X_text).reshape(-1, 1)\n
                                                                                                                                                                                                                                                                                                                                                                  # Use only a subset of the available topics for faster processing X_text, y_text = fetch_20newsgroups( return_X_y=True, categories=[ 'sci.med', 'comp.windows.x', 'misc.forsale', 'rec.autos', ], shuffle=True, random_state=1, ) X_text = np.array(X_text).reshape(-1, 1) In\u00a0[3]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMClassifier(X_text, y_text, index=True, test_size=0.3, verbose=2, random_state=1)\n
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMClassifier(X_text, y_text, index=True, test_size=0.3, verbose=2, random_state=1)
                                                                                                                                                                                                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Multiclass classification.\n\nDataset stats ==================== >>\nShape: (2366, 2)\nTrain set size: 1657\nTest set size: 709\n-------------------------------------\nMemory: 122.87 kB\nScaled: False\nCategorical features: 1 (100.0%)\n\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.dataset  # Note that the feature is automatically named 'corpus'\n
                                                                                                                                                                                                                                                                                                                                                                  atom.dataset # Note that the feature is automatically named 'corpus' Out[4]: corpus target 1731 From: rlm@helen.surfcty.com (Robert L. McMilli... 0 1496 From: carl@SOL1.GPS.CALTECH.EDU (Carl J Lydick... 3 1290 From: thssjxy@iitmax.iit.edu (Smile)\\nSubject:... 1 2021 From: c23st@kocrsv01.delcoelect.com (Spiros Tr... 2 142 From: ginkgo@ecsvax.uncecs.edu (J. Geary Morto... 1 ... ... ... 510 From: mary@uicsl.csl.uiuc.edu (Mary E. Allison... 3 1948 From: ndd@sunbar.mc.duke.edu (Ned Danieley)\\nS... 0 798 From: kk@unisql.UUCP (Kerry Kimbrough)\\nSubjec... 0 2222 From: hamachi@adobe.com (Gordon Hamachi)\\nSubj... 2 2215 From: mobasser@vu-vlsi.ee.vill.edu (Bijan Moba... 2

                                                                                                                                                                                                                                                                                                                                                                  2366 rows \u00d7 2 columns

                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Let's have a look at the first document\natom.corpus[0]\n
                                                                                                                                                                                                                                                                                                                                                                  # Let's have a look at the first document atom.corpus[0] Out[5]:
                                                                                                                                                                                                                                                                                                                                                                  'From: caf@omen.UUCP (Chuck Forsberg WA7KGX)\\nSubject: Re: My New Diet --> IT WORKS GREAT !!!!\\nOrganization: Omen Technology INC, Portland Rain Forest\\nLines: 32\\n\\nIn article <1qk6v3INNrm6@lynx.unm.edu> bhjelle@carina.unm.edu () writes:\\n>\\n>Gordon Banks:\\n>\\n>>a lot to keep from going back to morbid obesity.  I think all\\n>>of us cycle.  One\\'s success depends on how large the fluctuations\\n>>in the cycle are.  Some people can cycle only 5 pounds.  Unfortunately,\\n>>I\\'m not one of them.\\n>>\\n>>\\n>This certainly describes my situation perfectly. For me there is\\n>a constant dynamic between my tendency to eat, which appears to\\n>be totally limitless, and the purely conscious desire to not\\n>put on too much weight. When I get too fat, I just diet/exercise\\n>more (with varying degrees of success) to take off the\\n>extra weight. Usually I cycle within a 15 lb range, but\\n>smaller and larger cycles occur as well. I\\'m always afraid\\n>that this method will stop working someday, but usually\\n>I seem to be able to hold the weight gain in check.\\n>This is one reason I have a hard time accepting the notion\\n>of some metabolic derangement associated with cycle dieting\\n>(that results in long-term weight gain). I have been cycle-\\n>dieting for at least 20 years without seeing such a change.\\n\\nAs mentioned in Adiposity 101, only some experience weight\\nrebound.  
The fact that you don\\'t doesn\\'t prove it doesn\\'t\\nhappen to others.\\n-- \\nChuck Forsberg WA7KGX          ...!tektronix!reed!omen!caf \\nAuthor of YMODEM, ZMODEM, Professional-YAM, ZCOMM, and DSZ\\n  Omen Technology Inc    \"The High Reliability Software\"\\n17505-V NW Sauvie IS RD   Portland OR 97231   503-621-3406\\n'
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Clean the documents from noise (emails, numbers, etc...)\natom.textclean()\n
                                                                                                                                                                                                                                                                                                                                                                  # Clean the documents from noise (emails, numbers, etc...) atom.textclean()
                                                                                                                                                                                                                                                                                                                                                                  Fitting TextCleaner...\nCleaning the corpus...\n --> Decoding unicode characters to ascii.\n --> Converting text to lower case.\n --> Dropping emails from documents.\n --> Dropping URL links from documents.\n --> Dropping HTML tags from documents.\n --> Dropping emojis from documents.\n --> Dropping numbers from documents.\n --> Dropping punctuation from the text.\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[7]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Check how the first document changed\natom.corpus[0]\n
                                                                                                                                                                                                                                                                                                                                                                  # Check how the first document changed atom.corpus[0] Out[7]:
                                                                                                                                                                                                                                                                                                                                                                  'from  chuck forsberg wa7kgx\\nsubject re my new diet  it works great \\norganization omen technology inc portland rain forest\\nlines \\n\\nin article    writes\\n\\ngordon banks\\n\\na lot to keep from going back to morbid obesity  i think all\\nof us cycle  ones success depends on how large the fluctuations\\nin the cycle are  some people can cycle only  pounds  unfortunately\\nim not one of them\\n\\n\\nthis certainly describes my situation perfectly for me there is\\na constant dynamic between my tendency to eat which appears to\\nbe totally limitless and the purely conscious desire to not\\nput on too much weight when i get too fat i just dietexercise\\nmore with varying degrees of success to take off the\\nextra weight usually i cycle within a  lb range but\\nsmaller and larger cycles occur as well im always afraid\\nthat this method will stop working someday but usually\\ni seem to be able to hold the weight gain in check\\nthis is one reason i have a hard time accepting the notion\\nof some metabolic derangement associated with cycle dieting\\nthat results in longterm weight gain i have been cycle\\ndieting for at least  years without seeing such a change\\n\\nas mentioned in adiposity  only some experience weight\\nrebound  the fact that you dont doesnt prove it doesnt\\nhappen to others\\n \\nchuck forsberg wa7kgx          tektronixreedomencaf \\nauthor of ymodem zmodem professionalyam zcomm and dsz\\n  omen technology inc    the high reliability software\\nv nw sauvie is rd   portland or    \\n'
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[8]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Convert the strings to a sequence of words\natom.tokenize()\n
                                                                                                                                                                                                                                                                                                                                                                  # Convert the strings to a sequence of words atom.tokenize()
                                                                                                                                                                                                                                                                                                                                                                  Fitting Tokenizer...\nTokenizing the corpus...\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[9]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Print the first few words of the first document\natom.corpus[0][:7]\n
                                                                                                                                                                                                                                                                                                                                                                  # Print the first few words of the first document atom.corpus[0][:7] Out[9]:
                                                                                                                                                                                                                                                                                                                                                                  ['from', 'chuck', 'forsberg', 'wa7kgx', 'subject', 're', 'my']
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[10]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Normalize the text to a predefined standard\natom.textnormalize(stopwords=\"english\", lemmatize=True)\n
                                                                                                                                                                                                                                                                                                                                                                  # Normalize the text to a predefined standard atom.textnormalize(stopwords=\"english\", lemmatize=True)
                                                                                                                                                                                                                                                                                                                                                                  Fitting TextNormalizer...\nNormalizing the corpus...\n --> Dropping stopwords.\n --> Applying lemmatization.\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[11]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.corpus[0][:7]  # Check changes...\n
                                                                                                                                                                                                                                                                                                                                                                  atom.corpus[0][:7] # Check changes... Out[11]:
                                                                                                                                                                                                                                                                                                                                                                  ['chuck', 'forsberg', 'wa7kgx', 'subject', 'new', 'diet', 'work']
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[12]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Visualize the most common words with a wordcloud\natom.plot_wordcloud(figsize=(700, 500))\n
                                                                                                                                                                                                                                                                                                                                                                  # Visualize the most common words with a wordcloud atom.plot_wordcloud(figsize=(700, 500)) In\u00a0[13]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Have a look at the most frequent bigrams\natom.plot_ngrams(2)\n
                                                                                                                                                                                                                                                                                                                                                                  # Have a look at the most frequent bigrams atom.plot_ngrams(2) In\u00a0[14]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Create the bigrams using the tokenizer\natom.tokenize(bigram_freq=215)\n
                                                                                                                                                                                                                                                                                                                                                                  # Create the bigrams using the tokenizer atom.tokenize(bigram_freq=215)
                                                                                                                                                                                                                                                                                                                                                                  Fitting Tokenizer...\nTokenizing the corpus...\n --> Creating 7 bigrams on 3128 locations.\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[15]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.bigrams_\n
                                                                                                                                                                                                                                                                                                                                                                  atom.bigrams_ Out[15]: bigram frequency 0 x_x 1168 1 line_article 532 2 line_nntppostinghost 389 3 organization_university 331 4 gordon_bank 266 5 distribution_usa 227 6 line_distribution 215 In\u00a0[16]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # As a last step before modelling, convert the words to vectors\natom.vectorize(strategy=\"tfidf\")\n
                                                                                                                                                                                                                                                                                                                                                                  # As a last step before modelling, convert the words to vectors atom.vectorize(strategy=\"tfidf\")
                                                                                                                                                                                                                                                                                                                                                                  Fitting Vectorizer...\nVectorizing the corpus...\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[17]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # The dimensionality of the dataset has increased a lot!\natom.shape\n
                                                                                                                                                                                                                                                                                                                                                                  # The dimensionality of the dataset has increased a lot! atom.shape Out[17]:
                                                                                                                                                                                                                                                                                                                                                                  (2366, 24176)
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[18]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Note that the data is sparse and the columns are named\n# after the words they are embedding\natom.dtypes\n
                                                                                                                                                                                                                                                                                                                                                                  # Note that the data is sparse and the columns are named # after the words they are embedding atom.dtypes Out[18]:
                                                                                                                                                                                                                                                                                                                                                                  corpus_000000e5    Sparse[float64, 0]\ncorpus_00000ee5    Sparse[float64, 0]\ncorpus_000010af    Sparse[float64, 0]\ncorpus_0007259d    Sparse[float64, 0]\ncorpus_00072a27    Sparse[float64, 0]\n                          ...        \ncorpus_zurich      Sparse[float64, 0]\ncorpus_zvi         Sparse[float64, 0]\ncorpus_zx          Sparse[float64, 0]\ncorpus_zz          Sparse[float64, 0]\ntarget                          int64\nLength: 24176, dtype: object
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[19]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # When the dataset is sparse, stats() shows the density\natom.stats()\n
                                                                                                                                                                                                                                                                                                                                                                  # When the dataset is sparse, stats() shows the density atom.stats()
                                                                                                                                                                                                                                                                                                                                                                  Dataset stats ==================== >>\nShape: (2366, 24176)\nTrain set size: 1657\nTest set size: 709\n-------------------------------------\nMemory: 2.54 MB\nSparse: True\nDensity: 0.35%\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[20]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Check which models have support for sparse matrices\natom.available_models()[[\"acronym\", \"model\", \"accepts_sparse\"]]\n
                                                                                                                                                                                                                                                                                                                                                                  # Check which models have support for sparse matrices atom.available_models()[[\"acronym\", \"model\", \"accepts_sparse\"]] Out[20]: acronym model accepts_sparse 0 AdaB AdaBoost True 1 Bag Bagging True 2 BNB BernoulliNB True 3 CatB CatBoost True 4 CatNB CategoricalNB True 5 CNB ComplementNB True 6 Tree DecisionTree True 7 Dummy Dummy False 8 ETree ExtraTree True 9 ET ExtraTrees True 10 GNB GaussianNB False 11 GP GaussianProcess False 12 GBM GradientBoostingMachine True 13 hGBM HistGradientBoosting False 14 KNN KNearestNeighbors True 15 LGB LightGBM True 16 LDA LinearDiscriminantAnalysis False 17 lSVM LinearSVM True 18 LR LogisticRegression True 19 MLP MultiLayerPerceptron True 20 MNB MultinomialNB True 21 PA PassiveAggressive True 22 Perc Perceptron False 23 QDA QuadraticDiscriminantAnalysis False 24 RNN RadiusNearestNeighbors True 25 RF RandomForest True 26 Ridge Ridge True 27 SGD StochasticGradientDescent True 28 SVM SupportVectorMachine True 29 XGB XGBoost True In\u00a0[21]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Train the model\natom.run(models=\"RF\", metric=\"f1_weighted\")\n
                                                                                                                                                                                                                                                                                                                                                                  # Train the model atom.run(models=\"RF\", metric=\"f1_weighted\")
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: RF\nMetric: f1_weighted\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> f1_weighted: 1.0\nTest evaluation --> f1_weighted: 0.9181\nTime elapsed: 02m:24s\n-------------------------------------------------\nTime: 02m:24s\n\n\nFinal results ==================== >>\nTotal time: 02m:24s\n-------------------------------------\nRandomForest --> f1_weighted: 0.9181\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[22]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.evaluate()\n
                                                                                                                                                                                                                                                                                                                                                                  atom.evaluate() Out[22]: ba f1_weighted jaccard_weighted mcc precision_weighted recall_weighted RF 0.9183 0.9181 0.8486 0.8918 0.9206 0.9182 In\u00a0[23]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_confusion_matrix(figsize=(700, 600))\n
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_confusion_matrix(figsize=(700, 600)) In\u00a0[24]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_shap_decision(rows=0, show=15)\n
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_shap_decision(rows=0, show=15) In\u00a0[25]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_shap_beeswarm(target=0, show=15)\n
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_shap_beeswarm(target=0, show=15)
                                                                                                                                                                                                                                                                                                                                                                  100%|===================| 2827/2836 [02:38<00:00]        
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/nlp/#example-nlp", "title": "Example: NLP\u00b6", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This example shows how to use ATOM to quickly go from raw text data to model predictions.

                                                                                                                                                                                                                                                                                                                                                                  Import the 20 newsgroups text dataset from sklearn.datasets. The dataset comprises around 18000 articles on 20 topics. The goal is to predict the topic of every article.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/nlp/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/nlp/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/nlp/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/pruning/", "title": "Pruning", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Import packages\nfrom sklearn.datasets import load_breast_cancer\nfrom optuna.pruners import HyperbandPruner\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                                                                                                                                                                                                  # Import packages from sklearn.datasets import load_breast_cancer from optuna.pruners import HyperbandPruner from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Load the data\nX, y = load_breast_cancer(return_X_y=True)\n
                                                                                                                                                                                                                                                                                                                                                                  # Load the data X, y = load_breast_cancer(return_X_y=True) In\u00a0[3]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Initialize atom\natom = ATOMClassifier(X, y, verbose=2, random_state=1)\n
                                                                                                                                                                                                                                                                                                                                                                  # Initialize atom atom = ATOMClassifier(X, y, verbose=2, random_state=1)
                                                                                                                                                                                                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 141.24 kB\nScaled: False\nOutlier values: 167 (1.2%)\n\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Use ht_params to specify a custom pruner\n# Note that pruned trials show the number of iterations it completed\natom.run(\n    models=\"SGD\",\n    metric=\"f1\",\n    n_trials=25,\n    ht_params={\n        \"distributions\": [\"penalty\", \"max_iter\"],\n        \"pruner\": HyperbandPruner(),\n    }\n)\n
                                                                                                                                                                                                                                                                                                                                                                  # Use ht_params to specify a custom pruner # Note that pruned trials show the number of iterations it completed atom.run( models=\"SGD\", metric=\"f1\", n_trials=25, ht_params={ \"distributions\": [\"penalty\", \"max_iter\"], \"pruner\": HyperbandPruner(), } )
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: SGD\nMetric: f1\n\n\nRunning hyperparameter tuning for StochasticGradientDescent...\n| trial | penalty | max_iter |      f1 | best_f1 | time_trial | time_ht |    state |\n| ----- | ------- | -------- | ------- | ------- | ---------- | ------- | -------- |\n| 0     |      l1 |      650 |  0.9558 |  0.9558 |     2.801s |  2.801s | COMPLETE |\n| 1     | elast.. |     1050 |  0.9744 |  0.9744 |     4.590s |  7.390s | COMPLETE |\n| 2     | elast.. |      500 |  0.9828 |  0.9828 |     0.033s |  7.423s |   PRUNED |\n| 3     |    None |      700 |  0.9739 |  0.9828 |     2.951s | 10.374s | COMPLETE |\n| 4     |      l1 |     1400 |  0.9735 |  0.9828 |     0.033s | 10.407s |   PRUNED |\n| 5     |    None |     1400 |  0.9735 |  0.9828 |     5.994s | 16.401s | COMPLETE |\n| 6     |      l2 |     1200 |  0.9825 |  0.9828 |     5.246s | 21.647s | COMPLETE |\n| 7     |      l2 |     1250 |  0.9825 |  0.9828 |     5.436s | 27.083s | COMPLETE |\n| 8     |    None |      600 |  0.9828 |  0.9828 |     0.023s | 27.106s |   PRUNED |\n| 9     |      l1 |      600 |  0.9402 |  0.9828 |     0.030s | 27.136s |   PRUNED |\n| 10    |      l2 |      950 |  0.9565 |  0.9828 |     4.118s | 31.254s | COMPLETE |\n| 11    |      l2 |     1200 |  0.9825 |  0.9828 |     0.005s | 31.259s | COMPLETE |\n| 12    |      l2 |     1200 |  0.9825 |  0.9828 |     0.005s | 31.264s | COMPLETE |\n| 13    |      l2 |     1200 |  0.9825 |  0.9828 |     0.005s | 31.269s | COMPLETE |\n| 14    |      l2 |     1500 |  0.9573 |  0.9828 |     0.038s | 31.306s |   PRUNED |\n| 15    |      l2 |      950 |  0.9565 |  0.9828 
|     0.005s | 31.311s | COMPLETE |\n| 16    |      l2 |     1100 |  0.9391 |  0.9828 |     0.040s | 31.351s |   PRUNED |\n| 17    |      l2 |      850 |  0.9831 |  0.9831 |     0.030s | 31.381s |   PRUNED |\n| 18    | elast.. |     1300 |   0.931 |  0.9831 |     0.029s | 31.410s |   PRUNED |\n| 19    |      l2 |     1300 |  0.9649 |  0.9831 |     0.067s | 31.478s |   PRUNED |\n| 20    |      l2 |      800 |  0.9661 |  0.9831 |     0.039s | 31.517s |   PRUNED |\n| 21    |      l2 |     1150 |  0.9402 |  0.9831 |     0.032s | 31.548s |   PRUNED |\n| 22    |      l2 |     1300 |  0.9573 |  0.9831 |     0.038s | 31.586s |   PRUNED |\n| 23    |      l2 |     1250 |  0.9825 |  0.9831 |     0.008s | 31.594s | COMPLETE |\n| 24    |      l2 |     1050 |  0.9565 |  0.9831 |     0.070s | 31.665s |   PRUNED |\nHyperparameter tuning ---------------------------\nBest trial --> 6\nBest parameters:\n --> penalty: l2\n --> max_iter: 1200\nBest evaluation --> f1: 0.9825\nTime elapsed: 31.665s\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.993\nTest evaluation --> f1: 0.9722\nTime elapsed: 8.384s\n-------------------------------------------------\nTime: 40.049s\n\n\nFinal results ==================== >>\nTotal time: 40.301s\n-------------------------------------\nStochasticGradientDescent --> f1: 0.9722\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_trials()\n
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_trials() In\u00a0[6]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_hyperparameter_importance()\n
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_hyperparameter_importance()"}, {"location": "examples/pruning/#example-pruning", "title": "Example: Pruning\u00b6", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This example shows an advanced example on how to use hyperparameter tuning with pruning.

                                                                                                                                                                                                                                                                                                                                                                  Import the breast cancer dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/pruning/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/pruning/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/pruning/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/ray_backend/", "title": "Ray backend", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Import packages\nimport ray\nimport pandas as pd\nfrom atom import ATOMClassifier\nfrom sklearn.datasets import make_classification\n
                                                                                                                                                                                                                                                                                                                                                                  # Import packages import ray import pandas as pd from atom import ATOMClassifier from sklearn.datasets import make_classification In\u00a0[2]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Use a small dataset for illustration purposes\nX, y = make_classification(n_samples=10000, n_features=10, random_state=1)\n
                                                                                                                                                                                                                                                                                                                                                                  # Use a small dataset for illustration purposes X, y = make_classification(n_samples=10000, n_features=10, random_state=1) In\u00a0[3]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Note we already specify the number of cores for parallel execution here\natom = ATOMClassifier(X, y, n_jobs=2, backend=\"ray\", verbose=2, random_state=1)\n
                                                                                                                                                                                                                                                                                                                                                                  # Note we already specify the number of cores for parallel execution here atom = ATOMClassifier(X, y, n_jobs=2, backend=\"ray\", verbose=2, random_state=1)
                                                                                                                                                                                                                                                                                                                                                                  2023-11-04 23:01:00,897\tINFO worker.py:1664 -- Started a local Ray instance. View the dashboard at 127.0.0.1:8265 \n
                                                                                                                                                                                                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\nParallel processing with 2 cores.\nParallelization backend: ray\n\nDataset stats ==================== >>\nShape: (10000, 11)\nTrain set size: 8000\nTest set size: 2000\n-------------------------------------\nMemory: 880.13 kB\nScaled: True\nOutlier values: 211 (0.2%)\n\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # The ray backend uses modin instead of pandas as data handler\ntype(atom.dataset)\n
                                                                                                                                                                                                                                                                                                                                                                  # The ray backend uses modin instead of pandas as data handler type(atom.dataset) Out[4]:
                                                                                                                                                                                                                                                                                                                                                                  pandas.core.frame.DataFrame
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Use data cleaning as usual\natom.scale()\n
                                                                                                                                                                                                                                                                                                                                                                  # Use data cleaning as usual atom.scale()
                                                                                                                                                                                                                                                                                                                                                                  Fitting Scaler...\nScaling features...\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Using `parallel=True`, we train one model in each node\n# Note that when training in parallel, the verbosity of the models is zero\natom.run(models=[\"PA\", \"SGD\"], est_params={\"max_iter\": 150}, parallel=True)\n
                                                                                                                                                                                                                                                                                                                                                                  # Using `parallel=True`, we train one model in each node # Note that when training in parallel, the verbosity of the models is zero atom.run(models=[\"PA\", \"SGD\"], est_params={\"max_iter\": 150}, parallel=True)
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: PA, SGD\nMetric: f1\n\n\nFinal results ==================== >>\nTotal time: 9.407s\n-------------------------------------\nPassiveAggressive         --> f1: 0.8165\nStochasticGradientDescent --> f1: 0.8774 !\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[7]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Notice how the summed time to train the models is less than the total time\natom.plot_results(metric=\"time_fit\")\n
                                                                                                                                                                                                                                                                                                                                                                  # Notice how the summed time to train the models is less than the total time atom.plot_results(metric=\"time_fit\") In\u00a0[8]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Create a rest API endpoint and do inference on the holdout set\natom.pa.serve(port=8001)\n
                                                                                                                                                                                                                                                                                                                                                                  # Create a rest API endpoint and do inference on the holdout set atom.pa.serve(port=8001) In\u00a0[9]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  import requests\n\nX_predict = atom.X_test.iloc[:10, :]\nresponse = requests.get(\"http://127.0.0.1:8001/\", json=X_predict.to_json())\n
                                                                                                                                                                                                                                                                                                                                                                  import requests X_predict = atom.X_test.iloc[:10, :] response = requests.get(\"http://127.0.0.1:8001/\", json=X_predict.to_json()) In\u00a0[10]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  response.json()\n
                                                                                                                                                                                                                                                                                                                                                                  response.json() Out[10]:
                                                                                                                                                                                                                                                                                                                                                                  [1, 1, 0, 0, 1, 1, 0, 1, 0, 0]
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[11]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Don't forget to shut down the ray server\nray.shutdown()\n
                                                                                                                                                                                                                                                                                                                                                                  # Don't forget to shut down the ray server ray.shutdown()"}, {"location": "examples/ray_backend/#example-ray-backend", "title": "Example: Ray backend\u00b6", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This example shows how to use the ray backend to train models in a parallel context.

                                                                                                                                                                                                                                                                                                                                                                  The data used is a synthetic dataset created using sklearn's make_classification function.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/ray_backend/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/ray_backend/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/ray_backend/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/regression/", "title": "Regression", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Import packages\nimport pandas as pd\nfrom atom import ATOMRegressor\n
                                                                                                                                                                                                                                                                                                                                                                  # Import packages import pandas as pd from atom import ATOMRegressor In\u00a0[2]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Load the data\nX = pd.read_csv(\"docs_source/examples/datasets/abalone.csv\")\n\n# Let's have a look\nX.head()\n
                                                                                                                                                                                                                                                                                                                                                                  # Load the data X = pd.read_csv(\"docs_source/examples/datasets/abalone.csv\") # Let's have a look X.head() Out[2]: Sex Length Diameter Height Whole weight Shucked weight Viscera weight Shell weight Rings 0 M 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 15 1 M 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 7 2 F 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 9 3 M 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 10 4 I 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 7 In\u00a0[3]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Initialize atom for regression tasks\natom = ATOMRegressor(X, \"Rings\", verbose=2, random_state=42)\n
                                                                                                                                                                                                                                                                                                                                                                  # Initialize atom for regression tasks atom = ATOMRegressor(X, \"Rings\", verbose=2, random_state=42)
                                                                                                                                                                                                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Regression.\n\nDataset stats ==================== >>\nShape: (4177, 9)\nTrain set size: 3342\nTest set size: 835\n-------------------------------------\nMemory: 300.88 kB\nScaled: False\nCategorical features: 1 (12.5%)\nOutlier values: 195 (0.6%)\n\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Encode the categorical features\natom.encode()\n
                                                                                                                                                                                                                                                                                                                                                                  # Encode the categorical features atom.encode()
                                                                                                                                                                                                                                                                                                                                                                  Fitting Encoder...\nEncoding categorical columns...\n --> OneHot-encoding feature Sex. Contains 3 classes.\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Plot the dataset's correlation matrix\natom.plot_correlation()\n
                                                                                                                                                                                                                                                                                                                                                                  # Plot the dataset's correlation matrix atom.plot_correlation() In\u00a0[6]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Apply pca for dimensionality reduction\natom.feature_selection(strategy=\"pca\", n_features=6)\n
                                                                                                                                                                                                                                                                                                                                                                  # Apply pca for dimensionality reduction atom.feature_selection(strategy=\"pca\", n_features=6)
                                                                                                                                                                                                                                                                                                                                                                  Fitting FeatureSelector...\nPerforming feature selection ...\n --> Applying Principal Component Analysis...\n   --> Scaling features...\n   --> Keeping 6 components.\n   --> Explained variance ratio: 0.97\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[7]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Note that the fetaures are automatically renamed to pca0, pca1, etc...\natom.columns\n
                                                                                                                                                                                                                                                                                                                                                                  # Note that the fetaures are automatically renamed to pca0, pca1, etc... atom.columns Out[7]:
                                                                                                                                                                                                                                                                                                                                                                  Index(['pca0', 'pca1', 'pca2', 'pca3', 'pca4', 'pca5', 'Rings'], dtype='object')
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[8]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Use the plotting methods to see the retained variance ratio\natom.plot_pca()\n
                                                                                                                                                                                                                                                                                                                                                                  # Use the plotting methods to see the retained variance ratio atom.plot_pca() In\u00a0[9]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_components()\n
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_components() In\u00a0[10]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.run(\n    models=[\"Tree\", \"Bag\", \"ET\"],\n    metric=\"mse\",\n    n_trials=5,\n    n_bootstrap=5,\n)\n
                                                                                                                                                                                                                                                                                                                                                                  atom.run( models=[\"Tree\", \"Bag\", \"ET\"], metric=\"mse\", n_trials=5, n_bootstrap=5, )
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: Tree, Bag, ET\nMetric: mse\n\n\nRunning hyperparameter tuning for DecisionTree...\n| trial |   criterion | splitter | max_depth | min_samples_split | min_samples_leaf | max_features | ccp_alpha |     mse | best_mse | time_trial | time_ht |    state |\n| ----- | ----------- | -------- | --------- | ----------------- | ---------------- | ------------ | --------- | ------- | -------- | ---------- | ------- | -------- |\n| 0     | absolute_.. |     best |         5 |                 8 |               10 |         None |     0.035 | -6.5456 |  -6.5456 |     0.255s |  0.255s | COMPLETE |\n| 1     | squared_e.. |     best |        10 |                 5 |                1 |          0.5 |      0.03 | -7.1959 |  -6.5456 |     0.065s |  0.320s | COMPLETE |\n| 2     | absolute_.. |   random |        14 |                15 |               16 |         sqrt |     0.025 | -8.5859 |  -6.5456 |     0.067s |  0.387s | COMPLETE |\n| 3     | friedman_.. 
|   random |         4 |                10 |               17 |          0.9 |      0.01 | -7.4933 |  -6.5456 |     0.052s |  0.439s | COMPLETE |\n| 4     |     poisson |     best |        12 |                15 |                8 |          0.6 |      0.02 | -5.8126 |  -5.8126 |     0.066s |  0.505s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 4\nBest parameters:\n --> criterion: poisson\n --> splitter: best\n --> max_depth: 12\n --> min_samples_split: 15\n --> min_samples_leaf: 8\n --> max_features: 0.6\n --> ccp_alpha: 0.02\nBest evaluation --> mse: -5.8126\nTime elapsed: 0.505s\nFit ---------------------------------------------\nTrain evaluation --> mse: -6.2977\nTest evaluation --> mse: -7.1923\nTime elapsed: 0.045s\nBootstrap ---------------------------------------\nEvaluation --> mse: -7.6026 \u00b1 0.3783\nTime elapsed: 0.110s\n-------------------------------------------------\nTime: 0.660s\n\n\nRunning hyperparameter tuning for Bagging...\n| trial | n_estimators | max_samples | max_features | bootstrap | bootstrap_features |     mse | best_mse | time_trial | time_ht |    state |\n| ----- | ------------ | ----------- | ------------ | --------- | ------------------ | ------- | -------- | ---------- | ------- | -------- |\n| 0     |          190 |         1.0 |          0.9 |      True |               True | -4.5751 |  -4.5751 |     5.791s |  5.791s | COMPLETE |\n\nException encountered while running the Bag model.\nMemoryError: could not allocate 187712 bytes\n\n\nRunning hyperparameter tuning for ExtraTrees...\n| trial | n_estimators |     criterion | max_depth | min_samples_split | min_samples_leaf | max_features | bootstrap | max_samples | ccp_alpha |     mse | best_mse | time_trial | time_ht |    state |\n| ----- | ------------ | ------------- | --------- | ----------------- | ---------------- | ------------ | --------- | ----------- | --------- | ------- | -------- | ---------- | ------- | -------- |\n| 0     |         
 190 | squared_error |         8 |                13 |                3 |          0.5 |      True |         0.6 |     0.025 | -5.1462 |  -5.1462 |     0.285s |  0.285s | COMPLETE |\n| 1     |          230 | absolute_er.. |         8 |                 8 |                8 |         sqrt |      True |         0.6 |       0.0 | -9.3444 |  -5.1462 |     1.377s |  1.662s | COMPLETE |\n| 2     |          180 | absolute_er.. |         7 |                 2 |                3 |          0.6 |      True |         0.6 |      0.03 | -5.7371 |  -5.1462 |     1.738s |  3.400s | COMPLETE |\n| 3     |          100 | squared_error |        14 |                15 |                8 |         None |      True |         0.9 |     0.005 | -5.1938 |  -5.1462 |     0.231s |  3.631s | COMPLETE |\n| 4     |          340 | squared_error |         6 |                15 |                8 |         None |      True |         0.8 |      0.01 | -4.8716 |  -4.8716 |     0.457s |  4.088s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 4\nBest parameters:\n --> n_estimators: 340\n --> criterion: squared_error\n --> max_depth: 6\n --> min_samples_split: 15\n --> min_samples_leaf: 8\n --> max_features: None\n --> bootstrap: True\n --> max_samples: 0.8\n --> ccp_alpha: 0.01\nBest evaluation --> mse: -4.8716\nTime elapsed: 4.088s\nFit ---------------------------------------------\nTrain evaluation --> mse: -5.4808\nTest evaluation --> mse: -6.3445\nTime elapsed: 0.535s\nBootstrap ---------------------------------------\nEvaluation --> mse: -6.3694 \u00b1 0.0737\nTime elapsed: 2.245s\n-------------------------------------------------\nTime: 6.868s\n\n\nFinal results ==================== >>\nTotal time: 32.361s\n-------------------------------------\nDecisionTree --> mse: -7.6026 \u00b1 0.3783\nExtraTrees   --> mse: -6.3694 \u00b1 0.0737 !\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[11]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Use the errors or residuals plots to check the model performances\natom.plot_residuals()\n
                                                                                                                                                                                                                                                                                                                                                                  # Use the errors or residuals plots to check the model performances atom.plot_residuals() In\u00a0[12]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_errors()\n
                                                                                                                                                                                                                                                                                                                                                                  atom.plot_errors() In\u00a0[13]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Analyze the relation between the target response and the features\natom.plot_partial_dependence(columns=(0, 1, 2, 3))\n
                                                                                                                                                                                                                                                                                                                                                                  # Analyze the relation between the target response and the features atom.plot_partial_dependence(columns=(0, 1, 2, 3))"}, {"location": "examples/regression/#example-regression", "title": "Example: Regression\u00b6", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This example shows how to use ATOM to apply pca on the data and run a regression pipeline.

                                                                                                                                                                                                                                                                                                                                                                  Download the abalone dataset from https://archive.ics.uci.edu/ml/datasets/Abalone. The goal of this dataset is to predict the rings (age) of abalone shells from physical measurements.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/regression/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/regression/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/regression/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/successive_halving/", "title": "Successive halving", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  from sklearn.datasets import fetch_california_housing\nfrom atom import ATOMRegressor\n
                                                                                                                                                                                                                                                                                                                                                                  from sklearn.datasets import fetch_california_housing from atom import ATOMRegressor In\u00a0[2]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Load the data\nX, y = fetch_california_housing(return_X_y=True)\n
                                                                                                                                                                                                                                                                                                                                                                  # Load the data X, y = fetch_california_housing(return_X_y=True) In\u00a0[3]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMRegressor(X, y, verbose=2, random_state=1)\n
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMRegressor(X, y, verbose=2, random_state=1)
                                                                                                                                                                                                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Regression.\n\nDataset stats ==================== >>\nShape: (20640, 9)\nTrain set size: 16512\nTest set size: 4128\n-------------------------------------\nMemory: 1.49 MB\nScaled: False\nOutlier values: 786 (0.5%)\n\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Compare tree-based models via successive halving\natom.successive_halving(\n    models=[\"Tree\", \"Bag\", \"ET\", \"RF\", \"LGB\", \"CatB\"],\n    metric=\"mae\",\n    n_bootstrap=5,\n)\n
                                                                                                                                                                                                                                                                                                                                                                  # Compare tree-based models via successive halving atom.successive_halving( models=[\"Tree\", \"Bag\", \"ET\", \"RF\", \"LGB\", \"CatB\"], metric=\"mae\", n_bootstrap=5, )
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nMetric: mae\n\n\nRun: 0 =========================== >>\nModels: Tree6, Bag6, ET6, RF6, LGB6, CatB6\nSize of training set: 16512 (17%)\nSize of test set: 4128\n\n\nResults for DecisionTree:\nFit ---------------------------------------------\nTrain evaluation --> mae: -0.0\nTest evaluation --> mae: -0.5394\nTime elapsed: 0.103s\nBootstrap ---------------------------------------\nEvaluation --> mae: -0.576 \u00b1 0.0119\nTime elapsed: 0.422s\n-------------------------------------------------\nTime: 0.525s\n\n\nResults for Bagging:\nFit ---------------------------------------------\nTrain evaluation --> mae: -0.1715\nTest evaluation --> mae: -0.4308\nTime elapsed: 0.450s\nBootstrap ---------------------------------------\nEvaluation --> mae: -0.435 \u00b1 0.0059\nTime elapsed: 2.061s\n-------------------------------------------------\nTime: 2.511s\n\n\nResults for ExtraTrees:\nFit ---------------------------------------------\nTrain evaluation --> mae: -0.0\nTest evaluation --> mae: -0.3977\nTime elapsed: 1.574s\nBootstrap ---------------------------------------\nEvaluation --> mae: -0.4059 \u00b1 0.0028\nTime elapsed: 7.107s\n-------------------------------------------------\nTime: 8.681s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --> mae: -0.1508\nTest evaluation --> mae: -0.4053\nTime elapsed: 4.178s\nBootstrap ---------------------------------------\nEvaluation --> mae: -0.4162 \u00b1 0.0031\nTime elapsed: 18.156s\n-------------------------------------------------\nTime: 22.335s\n\n\nResults for LightGBM:\nFit 
---------------------------------------------\nTrain evaluation --> mae: -0.2031\nTest evaluation --> mae: -0.3594\nTime elapsed: 0.438s\nBootstrap ---------------------------------------\nEvaluation --> mae: -0.3673 \u00b1 0.0016\nTime elapsed: 0.886s\n-------------------------------------------------\nTime: 1.324s\n\n\nResults for CatBoost:\nFit ---------------------------------------------\nTrain evaluation --> mae: -0.1621\nTest evaluation --> mae: -0.3483\nTime elapsed: 5.084s\nBootstrap ---------------------------------------\nEvaluation --> mae: -0.3554 \u00b1 0.0025\nTime elapsed: 20.177s\n-------------------------------------------------\nTime: 25.261s\n\n\nFinal results ==================== >>\nTotal time: 01m:01s\n-------------------------------------\nDecisionTree --> mae: -0.576 \u00b1 0.0119 ~\nBagging      --> mae: -0.435 \u00b1 0.0059 ~\nExtraTrees   --> mae: -0.4059 \u00b1 0.0028 ~\nRandomForest --> mae: -0.4162 \u00b1 0.0031 ~\nLightGBM     --> mae: -0.3673 \u00b1 0.0016 ~\nCatBoost     --> mae: -0.3554 \u00b1 0.0025 ~ !\n\n\nRun: 1 =========================== >>\nModels: ET3, LGB3, CatB3\nSize of training set: 16512 (33%)\nSize of test set: 4128\n\n\nResults for ExtraTrees:\nFit ---------------------------------------------\nTrain evaluation --> mae: -0.0\nTest evaluation --> mae: -0.3739\nTime elapsed: 2.738s\nBootstrap ---------------------------------------\nEvaluation --> mae: -0.3841 \u00b1 0.0027\nTime elapsed: 11.259s\n-------------------------------------------------\nTime: 13.997s\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --> mae: -0.2327\nTest evaluation --> mae: -0.3356\nTime elapsed: 0.389s\nBootstrap ---------------------------------------\nEvaluation --> mae: -0.345 \u00b1 0.0037\nTime elapsed: 0.876s\n-------------------------------------------------\nTime: 1.265s\n\n\nResults for CatBoost:\nFit ---------------------------------------------\nTrain evaluation --> mae: 
-0.1882\nTest evaluation --> mae: -0.3255\nTime elapsed: 4.800s\nBootstrap ---------------------------------------\nEvaluation --> mae: -0.3352 \u00b1 0.0023\nTime elapsed: 22.708s\n-------------------------------------------------\nTime: 27.509s\n\n\nFinal results ==================== >>\nTotal time: 43.130s\n-------------------------------------\nExtraTrees --> mae: -0.3841 \u00b1 0.0027 ~\nLightGBM   --> mae: -0.345 \u00b1 0.0037 ~\nCatBoost   --> mae: -0.3352 \u00b1 0.0023 ~ !\n\n\nRun: 2 =========================== >>\nModels: CatB1\nSize of training set: 16512 (100%)\nSize of test set: 4128\n\n\nResults for CatBoost:\nFit ---------------------------------------------\nTrain evaluation --> mae: -0.2229\nTest evaluation --> mae: -0.2986\nTime elapsed: 6.851s\nBootstrap ---------------------------------------\nEvaluation --> mae: -0.3091 \u00b1 0.0026\nTime elapsed: 33.428s\n-------------------------------------------------\nTime: 40.279s\n\n\nFinal results ==================== >>\nTotal time: 40.375s\n-------------------------------------\nCatBoost --> mae: -0.3091 \u00b1 0.0026 ~\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # The results is now multi-index, where frac is the fraction\n# of the training set used to fit the model. The model names\n# end with the number of models fitted during that run\natom.results\n
                                                                                                                                                                                                                                                                                                                                                                  # The results is now multi-index, where frac is the fraction # of the training set used to fit the model. The model names # end with the number of models fitted during that run atom.results Out[5]: mae_train mae_test time_fit mae_bootstrap time_bootstrap time frac model 0.17 Bag6 -0.2017 -0.4327 0.450035 -0.434981 2.061373 2.511408 CatB6 -0.2065 -0.3557 5.083625 -0.355352 20.176994 25.260619 ET6 -0.0694 -0.4077 1.574000 -0.405855 7.106890 8.680890 LGB6 -0.2202 -0.3676 0.438399 -0.367271 0.885806 1.324205 RF6 -0.1851 -0.4165 4.178345 -0.416217 18.156310 22.334655 Tree6 -0.1039 -0.5897 0.102987 -0.575962 0.422224 0.525211 0.33 CatB3 -0.2249 -0.3384 4.800246 -0.335246 22.708465 27.508711 ET3 -0.0935 -0.3879 2.738315 -0.384081 11.258794 13.997109 LGB3 -0.2489 -0.3405 0.389353 -0.344951 0.875797 1.265150 1.00 CatB1 -0.2447 -0.3066 6.851350 -0.309112 33.428059 40.279409 In\u00a0[6]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Plot the successive halving's results\natom.plot_successive_halving()\n
                                                                                                                                                                                                                                                                                                                                                                  # Plot the successive halving's results atom.plot_successive_halving() In\u00a0[7]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Use regex to call all the models with the same estimator...\natom.plot_errors(models=[\"CatB.*\"])\n
                                                                                                                                                                                                                                                                                                                                                                  # Use regex to call all the models with the same estimator... atom.plot_errors(models=[\"CatB.*\"]) In\u00a0[8]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # ...or to call the models from the same run\natom.plot_errors(models=\".*3\")\n
                                                                                                                                                                                                                                                                                                                                                                  # ...or to call the models from the same run atom.plot_errors(models=\".*3\")"}, {"location": "examples/successive_halving/#example-successive-halving", "title": "Example: Successive halving\u00b6", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This example shows how to compare multiple tree-based models using successive halving.

                                                                                                                                                                                                                                                                                                                                                                  Import the california housing dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict house prices.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/successive_halving/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/successive_halving/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/successive_halving/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/train_sizing/", "title": "Train sizing", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                                                                                                                                                                                                  # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Load the data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n
                                                                                                                                                                                                                                                                                                                                                                  # Load the data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\") # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0

                                                                                                                                                                                                                                                                                                                                                                  5 rows \u00d7 22 columns

                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[3]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Initialize atom and prepare the data\natom = ATOMClassifier(X, verbose=2, random_state=1)\natom.clean()\natom.impute(strat_num=\"median\", strat_cat=\"most_frequent\", max_nan_rows=0.8)\natom.encode()\n
                                                                                                                                                                                                                                                                                                                                                                  # Initialize atom and prepare the data atom = ATOMClassifier(X, verbose=2, random_state=1) atom.clean() atom.impute(strat_num=\"median\", strat_cat=\"most_frequent\", max_nan_rows=0.8) atom.encode()
                                                                                                                                                                                                                                                                                                                                                                  << ================== ATOM ================== >>\n\nConfiguration ==================== >>\nAlgorithm task: Binary classification.\n\nDataset stats ==================== >>\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 25.03 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n\nFitting Cleaner...\nCleaning the data...\nFitting Imputer...\nImputing missing values...\n --> Dropping 161 samples for containing more than 16 missing values.\n --> Imputing 481 missing values with median (12.0) in feature MinTemp.\n --> Imputing 265 missing values with median (22.6) in feature MaxTemp.\n --> Imputing 1354 missing values with median (0.0) in feature Rainfall.\n --> Imputing 60682 missing values with median (4.8) in feature Evaporation.\n --> Imputing 67659 missing values with median (8.4) in feature Sunshine.\n --> Imputing 9187 missing values with most_frequent (W) in feature WindGustDir.\n --> Imputing 9127 missing values with median (39.0) in feature WindGustSpeed.\n --> Imputing 9852 missing values with most_frequent (N) in feature WindDir9am.\n --> Imputing 3617 missing values with most_frequent (SE) in feature WindDir3pm.\n --> Imputing 1187 missing values with median (13.0) in feature WindSpeed9am.\n --> Imputing 2469 missing values with median (19.0) in feature WindSpeed3pm.\n --> Imputing 1613 missing values with median (70.0) in feature Humidity9am.\n --> Imputing 3449 missing values with median (52.0) in feature Humidity3pm.\n --> Imputing 13863 missing values with median (1017.6) in feature 
Pressure9am.\n --> Imputing 13830 missing values with median (1015.2) in feature Pressure3pm.\n --> Imputing 53496 missing values with median (5.0) in feature Cloud9am.\n --> Imputing 56933 missing values with median (5.0) in feature Cloud3pm.\n --> Imputing 743 missing values with median (16.7) in feature Temp9am.\n --> Imputing 2565 missing values with median (21.1) in feature Temp3pm.\n --> Imputing 1354 missing values with most_frequent (No) in feature RainToday.\nFitting Encoder...\nEncoding categorical columns...\n --> Target-encoding feature Location. Contains 49 classes.\n --> Target-encoding feature WindGustDir. Contains 16 classes.\n --> Target-encoding feature WindDir9am. Contains 16 classes.\n --> Target-encoding feature WindDir3pm. Contains 16 classes.\n --> Ordinal-encoding feature RainToday. Contains 2 classes.\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Analyze the impact of the training set's size on a LR model\natom.train_sizing(\"LR\", train_sizes=10, n_bootstrap=5)\n
                                                                                                                                                                                                                                                                                                                                                                  # Analyze the impact of the training set's size on a LR model atom.train_sizing(\"LR\", train_sizes=10, n_bootstrap=5)
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nMetric: f1\n\n\nRun: 0 =========================== >>\nModels: LR01\nSize of training set: 11362 (10%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.5624\nTest evaluation --> f1: 0.5857\nTime elapsed: 0.721s\nBootstrap ---------------------------------------\nEvaluation --> f1: 0.585 \u00b1 0.0021\nTime elapsed: 0.729s\n-------------------------------------------------\nTime: 1.449s\n\n\nFinal results ==================== >>\nTotal time: 2.053s\n-------------------------------------\nLogisticRegression --> f1: 0.585 \u00b1 0.0021\n\n\nRun: 1 =========================== >>\nModels: LR02\nSize of training set: 22724 (20%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.582\nTest evaluation --> f1: 0.5874\nTime elapsed: 0.853s\nBootstrap ---------------------------------------\nEvaluation --> f1: 0.5851 \u00b1 0.002\nTime elapsed: 0.865s\n-------------------------------------------------\nTime: 1.718s\n\n\nFinal results ==================== >>\nTotal time: 2.425s\n-------------------------------------\nLogisticRegression --> f1: 0.5851 \u00b1 0.002\n\n\nRun: 2 =========================== >>\nModels: LR03\nSize of training set: 34087 (30%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.5812\nTest evaluation --> f1: 0.585\nTime elapsed: 1.086s\nBootstrap ---------------------------------------\nEvaluation 
--> f1: 0.5861 \u00b1 0.0009\nTime elapsed: 1.119s\n-------------------------------------------------\nTime: 2.205s\n\n\nFinal results ==================== >>\nTotal time: 3.035s\n-------------------------------------\nLogisticRegression --> f1: 0.5861 \u00b1 0.0009\n\n\nRun: 3 =========================== >>\nModels: LR04\nSize of training set: 45449 (40%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.5828\nTest evaluation --> f1: 0.5862\nTime elapsed: 1.173s\nBootstrap ---------------------------------------\nEvaluation --> f1: 0.5863 \u00b1 0.0017\nTime elapsed: 1.282s\n-------------------------------------------------\nTime: 2.455s\n\n\nFinal results ==================== >>\nTotal time: 3.365s\n-------------------------------------\nLogisticRegression --> f1: 0.5863 \u00b1 0.0017\n\n\nRun: 4 =========================== >>\nModels: LR05\nSize of training set: 56812 (50%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.5823\nTest evaluation --> f1: 0.5853\nTime elapsed: 1.264s\nBootstrap ---------------------------------------\nEvaluation --> f1: 0.585 \u00b1 0.0016\nTime elapsed: 1.460s\n-------------------------------------------------\nTime: 2.724s\n\n\nFinal results ==================== >>\nTotal time: 3.758s\n-------------------------------------\nLogisticRegression --> f1: 0.585 \u00b1 0.0016\n\n\nRun: 5 =========================== >>\nModels: LR06\nSize of training set: 68174 (60%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.5835\nTest evaluation --> f1: 0.5843\nTime elapsed: 1.392s\nBootstrap ---------------------------------------\nEvaluation --> f1: 0.585 \u00b1 0.0016\nTime elapsed: 1.704s\n-------------------------------------------------\nTime: 3.095s\n\n\nFinal results 
==================== >>\nTotal time: 4.151s\n-------------------------------------\nLogisticRegression --> f1: 0.585 \u00b1 0.0016\n\n\nRun: 6 =========================== >>\nModels: LR07\nSize of training set: 79536 (70%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.5872\nTest evaluation --> f1: 0.5846\nTime elapsed: 1.585s\nBootstrap ---------------------------------------\nEvaluation --> f1: 0.5852 \u00b1 0.0013\nTime elapsed: 1.836s\n-------------------------------------------------\nTime: 3.421s\n\n\nFinal results ==================== >>\nTotal time: 4.664s\n-------------------------------------\nLogisticRegression --> f1: 0.5852 \u00b1 0.0013\n\n\nRun: 7 =========================== >>\nModels: LR08\nSize of training set: 90899 (80%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.5889\nTest evaluation --> f1: 0.5841\nTime elapsed: 1.693s\nBootstrap ---------------------------------------\nEvaluation --> f1: 0.5852 \u00b1 0.0025\nTime elapsed: 2.139s\n-------------------------------------------------\nTime: 3.832s\n\n\nFinal results ==================== >>\nTotal time: 5.157s\n-------------------------------------\nLogisticRegression --> f1: 0.5852 \u00b1 0.0025\n\n\nRun: 8 =========================== >>\nModels: LR09\nSize of training set: 102261 (90%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.5871\nTest evaluation --> f1: 0.5837\nTime elapsed: 1.754s\nBootstrap ---------------------------------------\nEvaluation --> f1: 0.5844 \u00b1 0.0022\nTime elapsed: 2.353s\n-------------------------------------------------\nTime: 4.107s\n\n\nFinal results ==================== >>\nTotal time: 5.464s\n-------------------------------------\nLogisticRegression --> f1: 0.5844 \u00b1 
0.0022\n\n\nRun: 9 =========================== >>\nModels: LR10\nSize of training set: 113624 (100%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f1: 0.5856\nTest evaluation --> f1: 0.585\nTime elapsed: 1.978s\nBootstrap ---------------------------------------\nEvaluation --> f1: 0.5846 \u00b1 0.0005\nTime elapsed: 2.544s\n-------------------------------------------------\nTime: 4.521s\n\n\nFinal results ==================== >>\nTotal time: 5.975s\n-------------------------------------\nLogisticRegression --> f1: 0.5846 \u00b1 0.0005\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # The results are now multi-index, where frac is the fraction\n# of the training set used to fit the model. The model names\n# end with the fraction as well (without the dot)\natom.results\n
                                                                                                                                                                                                                                                                                                                                                                  # The results are now multi-index, where frac is the fraction # of the training set used to fit the model. The model names # end with the fraction as well (without the dot) atom.results Out[5]: f1_train f1_test time_fit f1_bootstrap time_bootstrap time frac model 0.1 LR01 0.5622 0.5852 0.720655 0.585044 0.728664 1.449319 0.2 LR02 0.5830 0.5845 0.852776 0.585144 0.864794 1.717570 0.3 LR03 0.5795 0.5856 1.085709 0.586101 1.119410 2.205119 0.4 LR04 0.5847 0.5858 1.173066 0.586305 1.282166 2.455232 0.5 LR05 0.5836 0.5862 1.264150 0.585003 1.460329 2.724479 0.6 LR06 0.5832 0.5833 1.391943 0.584966 1.703550 3.095493 0.7 LR07 0.5880 0.5856 1.585444 0.585199 1.835532 3.420976 0.8 LR08 0.5914 0.5882 1.693054 0.585235 2.138652 3.831706 0.9 LR09 0.5854 0.5828 1.753595 0.584420 2.353141 4.106736 1.0 LR10 0.5862 0.5850 1.977799 0.584634 2.543574 4.521373 In\u00a0[6]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Every model can be accessed through its name\natom.lr05.plot_shap_waterfall(show=6)\n
                                                                                                                                                                                                                                                                                                                                                                  # Every model can be accessed through its name atom.lr05.plot_shap_waterfall(show=6) In\u00a0[7]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Plot the train sizing's results\natom.plot_learning_curve()\n
                                                                                                                                                                                                                                                                                                                                                                  # Plot the train sizing's results atom.plot_learning_curve()"}, {"location": "examples/train_sizing/#example-train-sizing", "title": "Example: Train sizing\u00b6", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This example shows how to asses a model's performance based on the size of the training set.

                                                                                                                                                                                                                                                                                                                                                                  The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target RainTomorrow.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/train_sizing/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/train_sizing/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/train_sizing/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/utilities/", "title": "Utilities", "text": "In\u00a0[1]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Import packages\nimport tempfile\nimport pandas as pd\nfrom sklearn.metrics import fbeta_score\nfrom atom import ATOMClassifier\n
                                                                                                                                                                                                                                                                                                                                                                  # Import packages import tempfile import pandas as pd from sklearn.metrics import fbeta_score from atom import ATOMClassifier In\u00a0[2]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n
                                                                                                                                                                                                                                                                                                                                                                  # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\") # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0

                                                                                                                                                                                                                                                                                                                                                                  5 rows \u00d7 22 columns

                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[3]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMClassifier(X, random_state=1)\natom.clean()\n\n# Quickly check what columns have missing values\nprint(f\"Columns with missing values:\\n{atom.nans}\")\n\n# Or what columns are categorical\nprint(f\"\\nCategorical columns: {atom.categorical}\")\n\n# Or if the dataset is scaled\nprint(f\"\\nIs the dataset scaled? {atom.scaled}\")\n
                                                                                                                                                                                                                                                                                                                                                                  atom = ATOMClassifier(X, random_state=1) atom.clean() # Quickly check what columns have missing values print(f\"Columns with missing values:\\n{atom.nans}\") # Or what columns are categorical print(f\"\\nCategorical columns: {atom.categorical}\") # Or if the dataset is scaled print(f\"\\nIs the dataset scaled? {atom.scaled}\")
                                                                                                                                                                                                                                                                                                                                                                  Columns with missing values:\nLocation             0\nMinTemp            637\nMaxTemp            322\nRainfall          1406\nEvaporation      60843\nSunshine         67816\nWindGustDir       9330\nWindGustSpeed     9270\nWindDir9am       10013\nWindDir3pm        3778\nWindSpeed9am      1348\nWindSpeed3pm      2630\nHumidity9am       1774\nHumidity3pm       3610\nPressure9am      14014\nPressure3pm      13981\nCloud9am         53657\nCloud3pm         57094\nTemp9am            904\nTemp3pm           2726\nRainToday         1406\ndtype: int64\n\nCategorical columns: Index(['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday'], dtype='object')\n\nIs the dataset scaled? False\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[4]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Note the number of missing values and categorical columns\natom.stats()\n
                                                                                                                                                                                                                                                                                                                                                                  # Note the number of missing values and categorical columns atom.stats()
                                                                                                                                                                                                                                                                                                                                                                  Dataset stats ==================== >>\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 27.44 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[5]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Now, let's impute and encode the dataset...\natom.impute()\natom.encode()\n\n# ... and the values are gone\natom.stats()\n
                                                                                                                                                                                                                                                                                                                                                                  # Now, let's impute and encode the dataset... atom.impute() atom.encode() # ... and the values are gone atom.stats()
                                                                                                                                                                                                                                                                                                                                                                  Dataset stats ==================== >>\nShape: (56420, 22)\nTrain set size: 45075\nTest set size: 11345\n-------------------------------------\nMemory: 11.11 MB\nScaled: False\nOutlier values: 3203 (0.3%)\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[6]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Compare the relationship of multiple columns with a scatter maxtrix\natom.plot_relationships(columns=slice(0, 5))\n
                                                                                                                                                                                                                                                                                                                                                                  # Compare the relationship of multiple columns with a scatter maxtrix atom.plot_relationships(columns=slice(0, 5)) In\u00a0[7]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Check which distribution fits a column best\natom.distribution(columns=\"Rainfall\")\n
                                                                                                                                                                                                                                                                                                                                                                  # Check which distribution fits a column best atom.distribution(columns=\"Rainfall\") Out[7]: Rainfall dist stat beta score 0.6506 p_value 0.0 expon score 0.6506 p_value 0.0 gamma score 0.6465 p_value 0.0 invgauss score 0.6257 p_value 0.0 lognorm score 0.6485 p_value 0.0 norm score 0.3807 p_value 0.0 pearson3 score 0.6506 p_value 0.0 triang score 0.7191 p_value 0.0 uniform score 0.8914 p_value 0.0 weibull_min score 0.6506 p_value 0.0 weibull_max score 0.8896 p_value 0.0 In\u00a0[8]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Investigate a column's distribution\natom.plot_distribution(columns=\"MinTemp\", distributions=\"beta\")\natom.plot_qq(columns=\"MinTemp\", distributions=\"beta\")\n
                                                                                                                                                                                                                                                                                                                                                                  # Investigate a column's distribution atom.plot_distribution(columns=\"MinTemp\", distributions=\"beta\") atom.plot_qq(columns=\"MinTemp\", distributions=\"beta\")

                                                                                                                                                                                                                                                                                                                                                                  There are two ways to quickly transform the dataset mid-pipeline. The first way is through the property's @setter. The downside for this approach is that the transformation is not stored in atom's pipeline, so the transformation is not applied on new data. Therefore, we recommend using the second approach, through the add method.

                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[9]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Note that we can only replace a dataframe with a new dataframe!\natom.X = atom.X.assign(AvgTemp=(atom.X[\"MaxTemp\"] + atom.X[\"MinTemp\"])/2)\n\n# This will automatically update all other data attributes\nassert \"AvgTemp\" in atom\n\n# But it's not saved to atom's pipeline\natom.pipeline\n
                                                                                                                                                                                                                                                                                                                                                                  # Note that we can only replace a dataframe with a new dataframe! atom.X = atom.X.assign(AvgTemp=(atom.X[\"MaxTemp\"] + atom.X[\"MinTemp\"])/2) # This will automatically update all other data attributes assert \"AvgTemp\" in atom # But it's not saved to atom's pipeline atom.pipeline Out[9]:
                                                                                                                                                                                                                                                                                                                                                                  Pipeline(memory=Memory(location=None),\n         steps=[('Cleaner', Cleaner()), ('Imputer', Imputer()),\n                ('Encoder', Encoder(value='rare'))])
                                                                                                                                                                                                                                                                                                                                                                  In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.Pipeline
                                                                                                                                                                                                                                                                                                                                                                  Pipeline(memory=Memory(location=None),\n         steps=[('Cleaner', Cleaner()), ('Imputer', Imputer()),\n                ('Encoder', Encoder(value='rare'))])
                                                                                                                                                                                                                                                                                                                                                                  Cleaner
                                                                                                                                                                                                                                                                                                                                                                  Cleaner()
                                                                                                                                                                                                                                                                                                                                                                  Imputer
                                                                                                                                                                                                                                                                                                                                                                  Imputer()
                                                                                                                                                                                                                                                                                                                                                                  Encoder
                                                                                                                                                                                                                                                                                                                                                                  Encoder(value='rare')
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[10]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Same transformation, different approach (AvgTemp is overwritten)\ndef transform(df):\n    df[\"AvgTemp\"] = (df.MaxTemp + df.MinTemp) / 2\n    return df\n\natom.apply(transform)\n\nassert \"AvgTemp\" in atom\n
                                                                                                                                                                                                                                                                                                                                                                  # Same transformation, different approach (AvgTemp is overwritten) def transform(df): df[\"AvgTemp\"] = (df.MaxTemp + df.MinTemp) / 2 return df atom.apply(transform) assert \"AvgTemp\" in atom In\u00a0[11]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Now the function appears in the pipeline\natom.pipeline\n
                                                                                                                                                                                                                                                                                                                                                                  # Now the function appears in the pipeline atom.pipeline Out[11]:
                                                                                                                                                                                                                                                                                                                                                                  Pipeline(memory=Memory(location=None),\n         steps=[('Cleaner', Cleaner()), ('Imputer', Imputer()),\n                ('Encoder', Encoder(value='rare')),\n                ('FunctionTransformer',\n                 FunctionTransformer(func=<function transform at 0x0000016745DF6B90>))])
                                                                                                                                                                                                                                                                                                                                                                  In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.Pipeline
                                                                                                                                                                                                                                                                                                                                                                  Pipeline(memory=Memory(location=None),\n         steps=[('Cleaner', Cleaner()), ('Imputer', Imputer()),\n                ('Encoder', Encoder(value='rare')),\n                ('FunctionTransformer',\n                 FunctionTransformer(func=<function transform at 0x0000016745DF6B90>))])
                                                                                                                                                                                                                                                                                                                                                                  Cleaner
                                                                                                                                                                                                                                                                                                                                                                  Cleaner()
                                                                                                                                                                                                                                                                                                                                                                  Imputer
                                                                                                                                                                                                                                                                                                                                                                  Imputer()
                                                                                                                                                                                                                                                                                                                                                                  Encoder
                                                                                                                                                                                                                                                                                                                                                                  Encoder(value='rare')
                                                                                                                                                                                                                                                                                                                                                                  FunctionTransformer
                                                                                                                                                                                                                                                                                                                                                                  FunctionTransformer(func=<function transform at 0x0000016745DF6B90>)
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[12]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.available_models()\n
                                                                                                                                                                                                                                                                                                                                                                  atom.available_models() Out[12]: acronym model estimator module needs_scaling accepts_sparse native_multilabel native_multioutput has_validation supports_engines 0 AdaB AdaBoost AdaBoostClassifier sklearnensemble False True False False False sklearn 1 Bag Bagging BaggingClassifier sklearnensemble False True False False False sklearn 2 BNB BernoulliNB BernoulliNB sklearnnaive_bayes False True False False False sklearn, cuml 3 CatB CatBoost CatBoostClassifier catboostcatboost True True False False True catboost 4 CatNB CategoricalNB CategoricalNB sklearnnaive_bayes False True False False False sklearn, cuml 5 CNB ComplementNB ComplementNB sklearnnaive_bayes False True False False False sklearn, cuml 6 Tree DecisionTree DecisionTreeClassifier sklearntree False True True True False sklearn 7 Dummy Dummy DummyClassifier sklearndummy False False False False False sklearn 8 ETree ExtraTree ExtraTreeClassifier sklearntree False True True True False sklearn 9 ET ExtraTrees ExtraTreesClassifier sklearnensemble False True True True False sklearn 10 GNB GaussianNB GaussianNB sklearnnaive_bayes False False False False False sklearn, cuml 11 GP GaussianProcess GaussianProcessClassifier sklearngaussian_process False False False False False sklearn 12 GBM GradientBoostingMachine GradientBoostingClassifier sklearnensemble False True False False False sklearn 13 hGBM HistGradientBoosting HistGradientBoostingClassifier sklearnensemble False False False False False sklearn 14 KNN KNearestNeighbors KNeighborsClassifier sklearnneighbors True True True True False sklearn, sklearnex, cuml 15 LGB LightGBM LGBMClassifier 
lightgbmlightgbm.sklearn True True False False True lightgbm 16 LDA LinearDiscriminantAnalysis LinearDiscriminantAnalysis sklearndiscriminant_analysis False False False False False sklearn 17 lSVM LinearSVM LinearSVC sklearnsvm True True False False False sklearn, cuml 18 LR LogisticRegression LogisticRegression sklearnlinear_model True True False False False sklearn, sklearnex, cuml 19 MLP MultiLayerPerceptron MLPClassifier sklearnneural_network True True True False True sklearn 20 MNB MultinomialNB MultinomialNB sklearnnaive_bayes False True False False False sklearn, cuml 21 PA PassiveAggressive PassiveAggressiveClassifier sklearnlinear_model True True False False True sklearn 22 Perc Perceptron Perceptron sklearnlinear_model True False False False True sklearn 23 QDA QuadraticDiscriminantAnalysis QuadraticDiscriminantAnalysis sklearndiscriminant_analysis False False False False False sklearn 24 RNN RadiusNearestNeighbors RadiusNeighborsClassifier sklearnneighbors True True True True False sklearn 25 RF RandomForest RandomForestClassifier sklearnensemble False True True True False sklearn, sklearnex, cuml 26 Ridge Ridge RidgeClassifier sklearnlinear_model True True True False False sklearn, sklearnex, cuml 27 SGD StochasticGradientDescent SGDClassifier sklearnlinear_model True True False False True sklearn 28 SVM SupportVectorMachine SVC sklearnsvm True True False False False sklearn, sklearnex, cuml 29 XGB XGBoost XGBClassifier xgboostxgboost True True False False True xgboost In\u00a0[13]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.verbose = 1\n\n# Define a custom metric\ndef f2(y_true, y_pred):\n    return fbeta_score(y_true, y_pred, beta=2)\n\n# Use the greater_is_better, needs_proba and needs_threshold parameters if necessary\natom.run(models=\"LR\", metric=f2)\n
                                                                                                                                                                                                                                                                                                                                                                  atom.verbose = 1 # Define a custom metric def f2(y_true, y_pred): return fbeta_score(y_true, y_pred, beta=2) # Use the greater_is_better, needs_proba and needs_threshold parameters if necessary atom.run(models=\"LR\", metric=f2)
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: LR\nMetric: f2\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --> f2: 0.5693\nTest evaluation --> f2: 0.5709\nTime elapsed: 0.863s\n-------------------------------------------------\nTime: 0.863s\n\n\nFinal results ==================== >>\nTotal time: 1.491s\n-------------------------------------\nLogisticRegression --> f2: 0.5709\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[14]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # You can use the est_params parameter to customize the estimator\n# Let's run AdaBoost using LR instead of a decision tree as base estimator\natom.run(\"AdaB\", est_params={\"base_estimator\": atom.lr.estimator})\n
                                                                                                                                                                                                                                                                                                                                                                  # You can use the est_params parameter to customize the estimator # Let's run AdaBoost using LR instead of a decision tree as base estimator atom.run(\"AdaB\", est_params={\"base_estimator\": atom.lr.estimator})
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: AdaB\nMetric: f2\n\n\nResults for AdaBoost:\nFit ---------------------------------------------\nTrain evaluation --> f2: 0.556\nTest evaluation --> f2: 0.5636\nTime elapsed: 2.568s\n-------------------------------------------------\nTime: 2.568s\n\n\nFinal results ==================== >>\nTotal time: 3.065s\n-------------------------------------\nAdaBoost --> f2: 0.5636\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[15]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom.adab.estimator\n
                                                                                                                                                                                                                                                                                                                                                                  atom.adab.estimator Out[15]:
                                                                                                                                                                                                                                                                                                                                                                  AdaBoostClassifier(base_estimator=LogisticRegression(n_jobs=1, random_state=1),\n                   random_state=1)
                                                                                                                                                                                                                                                                                                                                                                  In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.AdaBoostClassifier
                                                                                                                                                                                                                                                                                                                                                                  AdaBoostClassifier(base_estimator=LogisticRegression(n_jobs=1, random_state=1),\n                   random_state=1)
                                                                                                                                                                                                                                                                                                                                                                  base_estimator: LogisticRegression
                                                                                                                                                                                                                                                                                                                                                                  LogisticRegression(n_jobs=1, random_state=1)
                                                                                                                                                                                                                                                                                                                                                                  LogisticRegression
                                                                                                                                                                                                                                                                                                                                                                  LogisticRegression(n_jobs=1, random_state=1)
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[16]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Note that parameters specified by est_params are not optimized in the BO\natom.run(\n    models=\"Tree\",\n    n_trials=10,\n    est_params={\n        \"criterion\": \"gini\",\n        \"splitter\": \"best\",\n        \"min_samples_leaf\": 1,\n        \"ccp_alpha\": 0.035,\n    },\n    verbose=2,\n)\n
                                                                                                                                                                                                                                                                                                                                                                  # Note that parameters specified by est_params are not optimized in the BO atom.run( models=\"Tree\", n_trials=10, est_params={ \"criterion\": \"gini\", \"splitter\": \"best\", \"min_samples_leaf\": 1, \"ccp_alpha\": 0.035, }, verbose=2, )
                                                                                                                                                                                                                                                                                                                                                                  \nTraining ========================= >>\nModels: Tree\nMetric: f2\n\n\nRunning hyperparameter tuning for DecisionTree...\n| trial | max_depth | min_samples_split | max_features |      f2 | best_f2 | time_trial | time_ht |    state |\n| ----- | --------- | ----------------- | ------------ | ------- | ------- | ---------- | ------- | -------- |\n| 0     |        13 |                12 |          0.5 |  0.4362 |  0.4362 |     3.161s |  3.161s | COMPLETE |\n| 1     |        14 |                16 |         log2 |  0.4729 |  0.4729 |     2.872s |  6.033s | COMPLETE |\n| 2     |        16 |                13 |          0.8 |  0.4626 |  0.4729 |     3.201s |  9.234s | COMPLETE |\n| 3     |         9 |                 6 |         None |  0.4903 |  0.4903 |     3.075s | 12.309s | COMPLETE |\n| 4     |         5 |                 2 |         log2 |  0.4889 |  0.4903 |     2.812s | 15.121s | COMPLETE |\n| 5     |         1 |                15 |          0.5 |  0.4953 |  0.4953 |     2.827s | 17.948s | COMPLETE |\n| 6     |        15 |                 9 |         sqrt |  0.5004 |  0.5004 |     2.951s | 20.899s | COMPLETE |\n| 7     |        13 |                20 |         None |  0.5004 |  0.5004 |     3.242s | 24.141s | COMPLETE |\n| 8     |         3 |                19 |          0.5 |  0.4936 |  0.5004 |     2.800s | 26.941s | COMPLETE |\n| 9     |        15 |                20 |         sqrt |  0.4762 |  0.5004 |     3.170s | 30.111s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --> 6\nBest parameters:\n --> max_depth: 15\n --> min_samples_split: 9\n --> max_features: sqrt\nBest evaluation --> 
f2: 0.5004\nTime elapsed: 30.111s\nFit ---------------------------------------------\nTrain evaluation --> f2: 0.4925\nTest evaluation --> f2: 0.4925\nTime elapsed: 0.452s\n-------------------------------------------------\nTime: 30.563s\n\n\nFinal results ==================== >>\nTotal time: 30.885s\n-------------------------------------\nDecisionTree --> f2: 0.4925\n

                                                                                                                                                                                                                                                                                                                                                                  Note that both instances need to be initialized with the same data and use the same metric for model training to be able to merge.

                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[17]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  tempdir = tempfile.gettempdir()\n
                                                                                                                                                                                                                                                                                                                                                                  tempdir = tempfile.gettempdir() In\u00a0[18]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Save the atom instance as a pickle\n# Use save_data=False to save the instance without the data\natom.save(tempdir + \"atom\", save_data=False)\n
                                                                                                                                                                                                                                                                                                                                                                  # Save the atom instance as a pickle # Use save_data=False to save the instance without the data atom.save(tempdir + \"atom\", save_data=False)
                                                                                                                                                                                                                                                                                                                                                                  ATOMClassifier successfully saved.\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[20]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # No need to store the transformed data, providing the original dataset to\n# the loader automatically transforms it through all the steps in the pipeline\natom_2 = ATOMClassifier.load(tempdir + \"atom\", data=(X,))\n
                                                                                                                                                                                                                                                                                                                                                                  # No need to store the transformed data, providing the original dataset to # the loader automatically transforms it through all the steps in the pipeline atom_2 = ATOMClassifier.load(tempdir + \"atom\", data=(X,))
                                                                                                                                                                                                                                                                                                                                                                  ATOMClassifier successfully loaded.\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[21]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Create a separate instance with its own branch and model\natom_3 = ATOMClassifier(X, verbose=0, random_state=1)\natom_3.branch.name = \"lightgbm\"\natom_3.impute()\natom_3.encode()\natom_3.run(\"LGB\", metric=f2)\n
                                                                                                                                                                                                                                                                                                                                                                  # Create a separate instance with its own branch and model atom_3 = ATOMClassifier(X, verbose=0, random_state=1) atom_3.branch.name = \"lightgbm\" atom_3.impute() atom_3.encode() atom_3.run(\"LGB\", metric=f2) In\u00a0[22]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Merge the instances\natom_2.merge(atom_3)\n
                                                                                                                                                                                                                                                                                                                                                                  # Merge the instances atom_2.merge(atom_3)
                                                                                                                                                                                                                                                                                                                                                                  Merging instances...\n --> Merging branch lightgbm.\n --> Merging model LGB.\n --> Merging attributes.\n
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[23]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  # Note that it now contains both branches and all models\natom_2\n
                                                                                                                                                                                                                                                                                                                                                                  # Note that it now contains both branches and all models atom_2 Out[23]:
                                                                                                                                                                                                                                                                                                                                                                  ATOMClassifier\n --> Branches:\n   --> main !\n   --> lightgbm\n --> Models: LR, AdaB, Tree, LGB\n --> Metric: f2
                                                                                                                                                                                                                                                                                                                                                                  In\u00a0[24]: Copied!
                                                                                                                                                                                                                                                                                                                                                                  atom_2.results\n
                                                                                                                                                                                                                                                                                                                                                                  atom_2.results Out[24]: f2_train f2_test time_fit time frac model 0.8 AdaB 0.5599 0.5590 2.568021 2.568021 LR 0.5723 0.5685 0.863496 0.863496 Tree 0.4930 0.4928 0.452411 30.563017 1.0 LGB 0.6578 0.5909 3.991159 3.991159"}, {"location": "examples/utilities/#example-utilities", "title": "Example: Utilities\u00b6", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This example shows various useful utilities that can be used to improve atom's pipelines.

                                                                                                                                                                                                                                                                                                                                                                  The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target RainTomorrow.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "examples/utilities/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/utilities/#use-the-utility-attributes", "title": "Use the utility attributes\u00b6", "text": ""}, {"location": "examples/utilities/#use-the-stats-method-to-assess-changes-in-the-dataset", "title": "Use the stats method to assess changes in the dataset\u00b6", "text": ""}, {"location": "examples/utilities/#inspect-feature-distributions", "title": "Inspect feature distributions\u00b6", "text": ""}, {"location": "examples/utilities/#change-the-data-mid-pipeline", "title": "Change the data mid-pipeline\u00b6", "text": ""}, {"location": "examples/utilities/#get-an-overview-of-the-available-models", "title": "Get an overview of the available models\u00b6", "text": ""}, {"location": "examples/utilities/#use-a-custom-metric", "title": "Use a custom metric\u00b6", "text": ""}, {"location": "examples/utilities/#customize-the-estimators-parameters", "title": "Customize the estimator's parameters\u00b6", "text": ""}, {"location": "examples/utilities/#save-load", "title": "Save & load\u00b6", "text": ""}, {"location": "user_guide/accelerating/", "title": "Accelerating pipelines", "text": "

                                                                                                                                                                                                                                                                                                                                                                  For very large datasets, ATOM offers various ways to accelerate its pipeline:

                                                                                                                                                                                                                                                                                                                                                                  • Run estimators on GPU
                                                                                                                                                                                                                                                                                                                                                                  • Use a faster data engine
                                                                                                                                                                                                                                                                                                                                                                  • Use a faster estimator engine
                                                                                                                                                                                                                                                                                                                                                                  • Run processes in parallel

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Performance improvements are usually noticeable for datasets larger than ~5M rows. For smaller datasets, using other values than the default can even harm performance!

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#gpu-acceleration", "title": "GPU acceleration", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Graphics Processing Units (GPUs) can significantly accelerate calculations for preprocessing steps or training machine learning models. Training models involves compute-intensive matrix multiplications and other operations that can take advantage of a GPU's massively parallel architecture. Training on large datasets can take hours to run on a single processor. However, if you offload those tasks to a GPU, you can reduce training time to minutes instead.

                                                                                                                                                                                                                                                                                                                                                                  Running transformers and models in atom using a GPU is as easy as initializing the instance with parameter device=\"gpu\". The device parameter accepts any string that follows the SYCL_DEVICE_FILTER filter selector. Examples are:

                                                                                                                                                                                                                                                                                                                                                                  • device=\"cpu\" (use CPU)
                                                                                                                                                                                                                                                                                                                                                                  • device=\"gpu\" (use default GPU)
                                                                                                                                                                                                                                                                                                                                                                  • device=\"gpu:0\" (use first GPU)
                                                                                                                                                                                                                                                                                                                                                                  • device=\"gpu:1\" (use second GPU)

                                                                                                                                                                                                                                                                                                                                                                  Combine GPU acceleration with the cuml and sklearnex estimator engines. The XGBoost, LightGBM and CatBoost models come with their own GPU engine. Setting device=\"gpu\" is sufficient to accelerate them with GPU, regardless of the engine parameter.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  ATOM does not support multi-GPU training. If there is more than one GPU on the machine and the device parameter does not specify which one to use, the first one is used by default.

                                                                                                                                                                                                                                                                                                                                                                  Example

                                                                                                                                                                                                                                                                                                                                                                  Train a model on a GPU yourself using SageMaker Studio Lab. Just click on the badge above and run the notebook! Make sure to choose the GPU compute type.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#data-acceleration", "title": "Data acceleration", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The data engine can be specified through the engine parameter, which takes a dict with a key data that accepts three values: numpy, pyarrow and modin.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#numpy", "title": "numpy", "text": "

                                                                                                                                                                                                                                                                                                                                                                  ATOM uses pandas as the default library for data handling, which in turn, uses numpy for all data processing.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#pyarrow", "title": "pyarrow", "text": "

                                                                                                                                                                                                                                                                                                                                                                  PyArrow is a library that provides a way to work with Apache Arrow memory structures. Apache Arrow is a cross-language, platform-independent, in-memory data format that provides an efficient and fast way to serialize and deserialize data. Pandas offers native integration with pyarrow, which atom uses when specifying the pyarrow data engine.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  • The pyarrow backend doesn't work for sparse datasets. If the dataset has any sparse columns, an exception is raised.
                                                                                                                                                                                                                                                                                                                                                                  • The LightGBM and XGBoost models don't support pyarrow dtypes.
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#modin", "title": "modin", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The modin library is a multi-threading, drop-in replacement for pandas, that uses Ray as backend.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#estimator-acceleration", "title": "Estimator acceleration", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The estimator engine can be specified through the engine parameter, which takes a dict with a key estimator that accepts three values: sklearn, sklearnex and cuml. Read here how to run the estimators on GPU instead of CPU.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Estimators accelerated with sklearnex or cuML sometimes use slightly different hyperparameters than their sklearn counterparts.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#sklearn", "title": "sklearn", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This is the default option, which uses the standard estimators from sklearn. Sklearn does not support training on GPU.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#sklearnex", "title": "sklearnex", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The Intel\u00ae Extension for Scikit-learn package (or sklearnex, for brevity) accelerates sklearn models and transformers, keeping full conformance with sklearn's API. Sklearnex is a free software AI accelerator that offers a way to make sklearn code 10\u2013100 times faster. The software acceleration is achieved through the use of vector instructions, IA hardware-specific memory optimizations, threading, and optimizations for all upcoming Intel platforms at launch time. See here an example using the sklearnex engine.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  sklearnex estimators don't support sparse datasets nor multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                                                                                  Intel\u00ae processors provide better performance than other CPUs.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#prerequisites", "title": "Prerequisites", "text": "
                                                                                                                                                                                                                                                                                                                                                                  • Operating System:
                                                                                                                                                                                                                                                                                                                                                                    • Linux (Ubuntu, Fedora, etc...)
                                                                                                                                                                                                                                                                                                                                                                    • Windows 8.1+
                                                                                                                                                                                                                                                                                                                                                                    • macOS (no GPU support)
                                                                                                                                                                                                                                                                                                                                                                  • CPU:
                                                                                                                                                                                                                                                                                                                                                                    • Processor must have x86 architecture.
                                                                                                                                                                                                                                                                                                                                                                    • Processor must support at least one of SSE2, AVX, AVX2, AVX512 instruction sets.
                                                                                                                                                                                                                                                                                                                                                                    • ARM* architecture is not supported.
                                                                                                                                                                                                                                                                                                                                                                  • GPU:
                                                                                                                                                                                                                                                                                                                                                                    • All Intel\u00ae integrated and discrete GPUs.
                                                                                                                                                                                                                                                                                                                                                                    • Intel\u00ae GPU drivers.
                                                                                                                                                                                                                                                                                                                                                                  • Libraries:
                                                                                                                                                                                                                                                                                                                                                                    • sklearnex>=2023.2.1 (automatically installed with atom when the processor has x86 architecture)
                                                                                                                                                                                                                                                                                                                                                                    • dpcpp_cpp_rt>=2023.2 (only for GPU acceleration)
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#supported-estimators", "title": "Supported estimators", "text": "
                                                                                                                                                                                                                                                                                                                                                                  • Pruner (only for strategy=\"dbscan\")
                                                                                                                                                                                                                                                                                                                                                                  • FeatureSelector (only for strategy=\"pca\" and dense datasets)

                                                                                                                                                                                                                                                                                                                                                                  • ElasticNet (only for CPU acceleration)

                                                                                                                                                                                                                                                                                                                                                                  • KNearestNeighbors
                                                                                                                                                                                                                                                                                                                                                                  • Lasso (only for CPU acceleration)
                                                                                                                                                                                                                                                                                                                                                                  • LogisticRegression
                                                                                                                                                                                                                                                                                                                                                                  • OrdinaryLeastSquares
                                                                                                                                                                                                                                                                                                                                                                  • RandomForest
                                                                                                                                                                                                                                                                                                                                                                  • Ridge (only for regression tasks and CPU acceleration)
                                                                                                                                                                                                                                                                                                                                                                  • SupportVectorMachine (GPU acceleration only supports classification tasks)
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#cuml", "title": "cuML", "text": "

                                                                                                                                                                                                                                                                                                                                                                  cuML is the machine learning library of the RAPIDS project. cuML enables you to run traditional tabular ML tasks on GPUs without going into the details of CUDA programming. For large datasets, these GPU-based implementations can complete 10-50x faster than their CPU equivalents.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  • cuML estimators don't support multioutput tasks nor the pyarrow data engine.
                                                                                                                                                                                                                                                                                                                                                                  • Install cuML using pip install --extra-index-url=https://pypi.nvidia.com cuml-cu11 or pip install --extra-index-url=https://pypi.nvidia.com cuml-cu12 depending on your CUDA version. Read more about RAPIDS' installation here.

                                                                                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                                                                                  Only transformers and predictors are converted to the requested engine. To use a metric from cuML, insert it directly in the run method:

                                                                                                                                                                                                                                                                                                                                                                  from atom import ATOMClassifier\nfrom cuml.metrics import accuracy_score\nfrom sklearn.datasets import make_classification\n\nX, y = make_classification(n_samples=100, random_state=1)\n\natom = ATOMClassifier(X, y, engine={\"estimator\": \"cuml\"}, verbose=2)\natom.run(\"LR\", metric=accuracy_score)\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#prerequisites_1", "title": "Prerequisites", "text": "
                                                                                                                                                                                                                                                                                                                                                                  • Operating System:
                                                                                                                                                                                                                                                                                                                                                                    • Ubuntu 18.04/20.04 or CentOS 7/8 with gcc/++ 9.0+
                                                                                                                                                                                                                                                                                                                                                                    • Windows 10+ with WSL2 (see here a tutorial)
                                                                                                                                                                                                                                                                                                                                                                  • GPU:
                                                                                                                                                                                                                                                                                                                                                                    • NVIDIA Pascal\u2122 or better with compute capability 6.0+
                                                                                                                                                                                                                                                                                                                                                                  • Drivers:
                                                                                                                                                                                                                                                                                                                                                                    • CUDA & NVIDIA Drivers of versions 11.0, 11.2, 11.4 or 11.5
                                                                                                                                                                                                                                                                                                                                                                  • Libraries:
                                                                                                                                                                                                                                                                                                                                                                    • cuML>=23.08
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#supported-estimators_1", "title": "Supported estimators", "text": "
                                                                                                                                                                                                                                                                                                                                                                  • Cleaner
                                                                                                                                                                                                                                                                                                                                                                  • Discretizer
                                                                                                                                                                                                                                                                                                                                                                  • Imputer (only for strat_num!=\"knn\")
                                                                                                                                                                                                                                                                                                                                                                  • Normalizer
                                                                                                                                                                                                                                                                                                                                                                  • Pruner (only for strategy=\"dbscan\" and \"hdbscan\")
                                                                                                                                                                                                                                                                                                                                                                  • Scaler
                                                                                                                                                                                                                                                                                                                                                                  • Vectorizer
                                                                                                                                                                                                                                                                                                                                                                  • FeatureSelector (only for strategy=\"pca\")

                                                                                                                                                                                                                                                                                                                                                                  • BernoulliNB

                                                                                                                                                                                                                                                                                                                                                                  • CategoricalNB
                                                                                                                                                                                                                                                                                                                                                                  • ElasticNet
                                                                                                                                                                                                                                                                                                                                                                  • GaussianNB
                                                                                                                                                                                                                                                                                                                                                                  • KNearestNeighbors
                                                                                                                                                                                                                                                                                                                                                                  • Lasso
                                                                                                                                                                                                                                                                                                                                                                  • LinearSVM
                                                                                                                                                                                                                                                                                                                                                                  • LogisticRegression
                                                                                                                                                                                                                                                                                                                                                                  • MultinomialNB
                                                                                                                                                                                                                                                                                                                                                                  • OrdinaryLeastSquares
                                                                                                                                                                                                                                                                                                                                                                  • RandomForest
                                                                                                                                                                                                                                                                                                                                                                  • Ridge (only for regression tasks)
                                                                                                                                                                                                                                                                                                                                                                  • SupportVectorMachine
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/accelerating/#parallel-execution", "title": "Parallel execution", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Another way to accelerate your pipelines is executing processes in parallel. Use the backend parameter to select one of several parallelization backends.

                                                                                                                                                                                                                                                                                                                                                                  • loky: Used by default, can induce some communication and memory overhead when exchanging input and output data with the worker Python processes. On some rare systems (such as Pyiodide), the loky backend may not be available.
                                                                                                                                                                                                                                                                                                                                                                  • multiprocessing: Previous process-based backend based on multiprocessing.Pool. Less robust than loky.
                                                                                                                                                                                                                                                                                                                                                                  • threading: Very low-overhead backend but it suffers from the Python Global Interpreter Lock if the called function relies a lot on Python objects. It's mostly useful when the execution bottleneck is a compiled extension that explicitly releases the GIL (for instance a Cython loop wrapped in a \"with nogil\" block or an expensive call to a library such as numpy).
                                                                                                                                                                                                                                                                                                                                                                  • ray: Ray is an open-source unified compute framework that makes it easy to scale AI and Python workloads. Read more about Ray here. See here an example use case.

                                                                                                                                                                                                                                                                                                                                                                  The parallelization backend is applied in the following cases:

                                                                                                                                                                                                                                                                                                                                                                  • In every individual estimator that uses parallelization internally.
                                                                                                                                                                                                                                                                                                                                                                  • To calculate cross-validated results during hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                                  • To train multiple models in parallel (when the trainer's parallel parameter is True).
                                                                                                                                                                                                                                                                                                                                                                  • To calculate partial dependencies in plot_partial_dependence.

                                                                                                                                                                                                                                                                                                                                                                  Note

                                                                                                                                                                                                                                                                                                                                                                  The njobs parameter sets the number of cores for the individual models as well as for parallel training. You won't gain much training two models in parallel with 2 cores, when the models also parallelize computations internally. Instead, use parallel training for models that can't parallelize their training (their constructor doesn't have the n_jobs parameter).

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/data_cleaning/", "title": "Data cleaning", "text": "

                                                                                                                                                                                                                                                                                                                                                                  More often than not, you'll need to do some data cleaning before fitting your dataset to a model. Usually, this involves importing different libraries and writing many lines of code. Since ATOM is all about fast exploration and experimentation, it provides various data cleaning classes to apply the most common transformations fast and easy.

                                                                                                                                                                                                                                                                                                                                                                  Note

                                                                                                                                                                                                                                                                                                                                                                  • All of atom's data cleaning methods automatically adopt the relevant transformer attributes (n_jobs, verbose, logger, random_state) from atom. A different choice can be added as parameter to the method call, e.g., atom.scale(verbose=2).
                                                                                                                                                                                                                                                                                                                                                                  • Like the add method, the data cleaning methods accept the columns parameter to only transform a subset of the dataset's features, e.g., atom.scale(columns=[0, 1]). Read more in the row and column selection section.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/data_cleaning/#balancing-the-data", "title": "Balancing the data", "text": "

                                                                                                                                                                                                                                                                                                                                                                  One of the common issues found in datasets that are used for classification is imbalanced classes. Data imbalance usually reflects an unequal distribution of classes within a dataset. For example, in a credit card fraud detection dataset, most of the transactions are non-fraud, and a very few cases are fraud. This leaves us with a very unbalanced ratio of fraud vs non-fraud cases. The Balancer class can oversample the minority class or undersample the majority class using any of the transformers implemented in the imblearn package. It can be accessed from atom through the balance method.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/data_cleaning/#standard-data-cleaning", "title": "Standard data cleaning", "text": "

                                                                                                                                                                                                                                                                                                                                                                  There are many data cleaning steps that are useful to perform on any dataset before modeling. These are general rules that apply almost on every use-case and every task. The Cleaner class is a convenient tool to apply such steps. It can be accessed from atom through the clean method. Use the class' parameters to choose which transformations to perform. The available steps are:

                                                                                                                                                                                                                                                                                                                                                                  • Drop columns with specific data types.
                                                                                                                                                                                                                                                                                                                                                                  • Strip categorical features from white spaces.
                                                                                                                                                                                                                                                                                                                                                                  • Drop duplicate rows.
                                                                                                                                                                                                                                                                                                                                                                  • Drop rows with missing values in the target column.
                                                                                                                                                                                                                                                                                                                                                                  • Encode the target column.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/data_cleaning/#binning-numerical-features", "title": "Binning numerical features", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Discretization (otherwise known as quantization or binning) provides a way to partition continuous features into discrete values. Certain datasets with continuous features may benefit from discretization, because discretization can transform the dataset of continuous attributes to one with only nominal attributes. Discretization is similar to constructing histograms for continuous data. However, histograms focus on counting features which fall into particular bins, whereas discretization focuses on assigning feature values to these bins. The Discretizer class can be used to bin continuous data into intervals. It can be accessed from atom through the discretize method.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/data_cleaning/#encoding-categorical-features", "title": "Encoding categorical features", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Many datasets contain categorical features. Their variables are typically stored as text values which represent various classes. Some examples include color (\u201cRed\u201d, \u201cYellow\u201d, \u201cBlue\u201d), size (\u201cSmall\u201d, \u201cMedium\u201d, \u201cLarge\u201d) or geographic designations (city or country). Regardless of what the value is used for, the challenge is determining how to use this data in the analysis. The majority of sklearn's models don't support direct manipulation of this kind of data. Use the Encoder class to encode categorical features to numerical values. It can be accessed from atom through the encode method.

                                                                                                                                                                                                                                                                                                                                                                  There are many strategies to encode categorical columns. The Encoder class applies one strategy or another depending on the number of classes in the column to be encoded. When there are only two, the values are encoded with 0 or 1. When there are more than two, the columns can be encoded using one-hot encoding or any other strategy of the category-encoders package, depending on the value of the max_onehot parameter. One-hot encodes the column making a dummy feature for every class. This approach preserves all the information but increases the size of the dataset considerably, making it often an undesirable strategy for high cardinality features. Other strategies like Target transform the column in place.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/data_cleaning/#imputing-missing-values", "title": "Imputing missing values", "text": "

                                                                                                                                                                                                                                                                                                                                                                  For various reasons, many real world datasets contain missing values, often encoded as blanks, NaNs or other placeholders. Such datasets however are incompatible with ATOM's models which assume that all values in an array are numerical, and that all have and hold meaning. The Imputer class handles missing values in the dataset by either dropping or imputing the value. It can be accessed from atom through the impute method.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/data_cleaning/#normalizing-the-feature-set", "title": "Normalizing the feature set", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Use the Normalizer class to transform the feature set to follow a Normal (Gaussian)-like distribution. In general, data must be transformed when using models that assume normality in the residuals. Examples of such models are LogisticRegression, LinearDiscriminantAnalysis and GaussianNB. The class can be accessed from atom through the normalize method.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/data_cleaning/#handling-outliers", "title": "Handling outliers", "text": "

                                                                                                                                                                                                                                                                                                                                                                  When modeling, it is important to clean the data sample to ensure that the observations best represent the problem. Sometimes a dataset can contain extreme values that are outside the range of what is expected and unlike the other data. These are called outliers. Often, machine learning modeling and model skill in general can be improved by understanding and even removing these outlier samples. The Pruner class offers 7 different strategies to detect outliers (described hereunder). It can be accessed from atom through the prune method.

                                                                                                                                                                                                                                                                                                                                                                  z-score The z-score of a value in the dataset is defined as the number of standard deviations by which the value is above or below the mean of the column. Values above or below a certain threshold (specified with the parameter max_sigma) are considered outliers. Note that, contrary to the rest of the strategies, this approach selects outlier values, not outlier samples! Because of this, it is possible to replace the outlier value instead of dropping the entire sample.

                                                                                                                                                                                                                                                                                                                                                                  Isolation Forest Uses a tree-based anomaly detection algorithm. It is based on modeling the normal data in such a way as to isolate anomalies that are both few and different in the feature space. Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                                                                                  Elliptic Envelope If the input variables have a Gaussian distribution, then simple statistical methods can be used to detect outliers. For example, if the dataset has two input variables and both are Gaussian, the feature space forms a multidimensional Gaussian, and knowledge of this distribution can be used to identify values far from the distribution. This approach can be generalized by defining a hypersphere (ellipsoid) that covers the normal data, and data that falls outside this shape is considered an outlier. Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                                                                                  Local Outlier Factor A simple approach to identifying outliers is to locate those examples that are far from the other examples in the feature space. This can work well for feature spaces with low dimensionality (few features) but becomes less reliable as the number of features is increased. The local outlier factor is a technique that attempts to harness the idea of nearest neighbors for outlier detection. Each example is assigned a score of how isolated or how likely it is to be outliers based on the size of its local neighborhood. Those examples with the largest score are more likely to be outliers. Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                                                                                  One-class SVM The support vector machine algorithm, initially developed for binary classification tasks, can also be used for one-class classification. When modeling one class, the algorithm captures the density of the majority class and classifies examples on the extremes of the density function as outliers. This modification of SVM is referred to as One-Class SVM. Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                                                                                  DBSCAN The DBSCAN algorithm views clusters as areas of high density separated by areas of low density. Due to this rather generic view, clusters found by DBSCAN can be any shape, as opposed to k-means which assumes that clusters are convex shaped. Samples that lie outside any cluster are considered outliers. Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                                                                                  OPTICS The OPTICS algorithm shares many similarities with the DBSCAN algorithm, and can be considered a generalization of DBSCAN that relaxes the eps requirement from a single value to a value range. The key difference between DBSCAN and OPTICS is that the OPTICS algorithm builds a reachability graph, and a spot within the cluster ordering. These two attributes are assigned when the model is fitted, and are used to determine cluster membership. Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/data_cleaning/#scaling-the-feature-set", "title": "Scaling the feature set", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Standardization of a dataset is a common requirement for many machine learning estimators; they might behave badly if the individual features do not more or less look like standard normally distributed data (e.g. Gaussian with zero mean and unit variance). The Scaler class let you quickly scale atom's dataset using one of sklearn's scalers. It can be accessed from atom through the scale method.

                                                                                                                                                                                                                                                                                                                                                                  Info

                                                                                                                                                                                                                                                                                                                                                                  All strategies can utilize GPU speed-up. Click here for further information about GPU acceleration.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/data_management/", "title": "Data management", "text": ""}, {"location": "user_guide/data_management/#data-sets", "title": "Data sets", "text": "

                                                                                                                                                                                                                                                                                                                                                                  ATOM is designed to work around one single dataset: the one with which atom is initialized. This is the dataset you want to explore, transform, and use for model training and validation. ATOM differentiates three different data sets:

                                                                                                                                                                                                                                                                                                                                                                  • The training set is usually the largest of the data sets. As the name suggests, this set is used to train the pipeline. During hyperparameter tuning, only the training set is used to fit and evaluate the estimator in every call. The training set in the current branch can be accessed through the train attribute. It's features and target can be accessed through X_train and y_train respectively.
                                                                                                                                                                                                                                                                                                                                                                  • The test set is used to evaluate the models. The model scores on this set give an indication on how the model performs on new data. The test set can be accessed through the test attribute. It's features and target can be accessed through X_test and y_test respectively.
                                                                                                                                                                                                                                                                                                                                                                  • The holdout set is an optional, separate set that should only be used to evaluate the final model's performance. Create this set when you are going to use the test set for an intermediate validation step. The holdout set is immediately set apart during initialization and is not considered part of atom's dataset (the dataset attribute only returns the train and test sets). The holdout set is left untouched until predictions are made on it, i.e., it does not undergo any pipeline transformations until the data set is requested for the first time. The holdout set is stored in atom's holdout attribute. See herean example that shows how to use the holdout data set.

                                                                                                                                                                                                                                                                                                                                                                  The data can be provided in different formats. If the data sets are not specified beforehand, you can input the features and target separately or together:

                                                                                                                                                                                                                                                                                                                                                                  • X
                                                                                                                                                                                                                                                                                                                                                                  • X, y

                                                                                                                                                                                                                                                                                                                                                                  Remember to use the y parameter to indicate the target column in X when using the first option. If not specified, the last column in X is used as target. In both these cases, the size of the sets are defined using the test_size and holdout_size parameters. Note that the splits are made after the subsample of the dataset with the n_rows parameter (when not left to its default value).

                                                                                                                                                                                                                                                                                                                                                                  If you already have the separate data sets, provide them using one of the following formats:

                                                                                                                                                                                                                                                                                                                                                                  • train, test
                                                                                                                                                                                                                                                                                                                                                                  • train, test, holdout
                                                                                                                                                                                                                                                                                                                                                                  • X_train, X_test, y_train, y_test
                                                                                                                                                                                                                                                                                                                                                                  • X_train, X_test, X_holdout, y_train, y_test, y_holdout
                                                                                                                                                                                                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test)
                                                                                                                                                                                                                                                                                                                                                                  • (X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)

                                                                                                                                                                                                                                                                                                                                                                  The input data is always converted internally to a dataframe, if it isn't one already. The column names should always be strings. If they are not, atom changes their type at initialization. If no column names are provided, default names are given of the form X[N-1], where N stands for the n-th feature in the dataset.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#indexing", "title": "Indexing", "text": "

                                                                                                                                                                                                                                                                                                                                                                  By default, atom resets the dataframe's index after initialization and after every transformation in the pipeline. To avoid this, specify the index parameter. If the dataset has an 'identifier' column, it is useful to use it as index for two reasons:

                                                                                                                                                                                                                                                                                                                                                                  • An identifier doesn't usually contain any useful information on the target column, and should therefore be removed before training.
                                                                                                                                                                                                                                                                                                                                                                  • Predictions of specific rows can be accessed through their index.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Avoid duplicate indices in the dataframe. Having them raises an error when initializing atom and may potentially lead to unexpected behavior if introduced later.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#sparse-datasets", "title": "Sparse datasets", "text": "

                                                                                                                                                                                                                                                                                                                                                                  If atom is initialized using a scipy sparse matrix, it is converted internally to a dataframe of sparse columns. Read more about pandas' sparse data structures here. The same conversion takes place when a transformer returns a sparse matrix, like for example, the Vectorizer.

                                                                                                                                                                                                                                                                                                                                                                  Note that ATOM considers a dataset to be sparse if any of the columns is sparse. A dataset can only benefit from sparsity when all its columns are sparse, hence mixing sparse and non-sparse columns is not recommended and can cause estimators to decrease their training speed or even crash. Use the shrink method to convert dense features to sparse and the available_models method to check which models have native support for sparse matrices.

                                                                                                                                                                                                                                                                                                                                                                  Click here to see an example that uses sparse data.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Estimators accelerated using sklearnex don't support sparse datasets.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#multioutput-tasks", "title": "Multioutput tasks", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Multioutput is a task where there are more than one target column, i.e., the goal is to predict multiple targets at the same time. When providing a dataframe as target, use the y parameter. Providing y without keyword makes ATOM think you are providing train, test (see the data sets section).

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#task-types", "title": "Task types", "text": "

                                                                                                                                                                                                                                                                                                                                                                  ATOM recognizes four multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  Note

                                                                                                                                                                                                                                                                                                                                                                  Combinations of binary and multiclass target columns are treated as multiclass-multioutput tasks.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#multilabel", "title": "Multilabel", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Multilabel is a classification task, labeling each sample with m labels from n_classes possible classes, where m can be 0 to n_classes inclusive. This can be thought of as predicting properties of a sample that are not mutually exclusive.

                                                                                                                                                                                                                                                                                                                                                                  For example, prediction of the topics relevant to a text document. The document may be about one of religion, politics, finance or education, several of the topic classes or all of the topic classes. The target column (atom.y) could look like this:

                                                                                                                                                                                                                                                                                                                                                                  0                        [politics]\n1               [religion, finance]\n2    [politics, finance, education]\n3                                []\n4                         [finance]\n5               [finance, religion]\n6                         [finance]\n7               [religion, finance]\n8                       [education]\n9     [finance, religion, politics]\n\nName: target, dtype: object\n

                                                                                                                                                                                                                                                                                                                                                                  A model can not directly ingest a variable amount of target classes. Use the clean method to assign a binary output to each class, for every sample. Positive classes are indicated with 1 and negative classes with 0. It is thus comparable to running n_classes binary classification tasks. In our example, the target (atom.y) is converted to:

                                                                                                                                                                                                                                                                                                                                                                     education  finance  politics  religion\n0          0        0         1         0\n1          0        1         0         1\n2          1        1         1         0\n3          0        0         0         0\n4          0        1         0         0\n5          0        1         0         1\n6          0        1         0         0\n7          0        1         0         1\n8          1        0         0         0\n9          0        1         1         1\n
                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#multiclass-multioutput", "title": "Multiclass-multioutput", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Multiclass-multioutput (also known as multitask classification) is a classification task which labels each sample with a set of non-binary properties. Both the number of properties and the number of classes per property is greater than 2. A single estimator thus handles several joint classification tasks. This is both a generalization of the multilabel classification task, which only considers binary attributes, as well as a generalization of the multiclass classification task, where only one property is considered.

                                                                                                                                                                                                                                                                                                                                                                  For example, classification of the properties \"type of fruit\" and \"colour\" for a set of images of fruit. The property \"type of fruit\" has the possible classes: \"apple\", \"pear\" and \"orange\". The property \"colour\" has the possible classes: \"green\", \"red\", \"yellow\" and \"orange\". Each sample is an image of a fruit, a label is output for both properties and each label is one of the possible classes of the corresponding property.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#multioutput-regression", "title": "Multioutput regression", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Multioutput regression predicts multiple numerical properties for each sample. Each property is a numerical variable and the number of properties to be predicted for each sample is >= 2. Some estimators that support multioutput regression are faster than just running n_output estimators.

                                                                                                                                                                                                                                                                                                                                                                  For example, prediction of both wind speed and wind direction, in degrees, using data obtained at a certain location. Each sample would be data obtained at one location and both wind speed and direction would be output for each sample.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#multivariate", "title": "Multivariate", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Multivariate is the multioutput task for forecasting. In this case, we try to forecast more than one time series at the same time.

                                                                                                                                                                                                                                                                                                                                                                  Although all forecasting models in ATOM support multivariate tasks, we differentiate two types of models:

                                                                                                                                                                                                                                                                                                                                                                  • The \"native multivariate\" models apply forecasts where every prediction of endogeneous (y) variables will depend on values of the other target columns.
                                                                                                                                                                                                                                                                                                                                                                  • The rest of the models apply an estimator per column, meaning that forecasts will be made per endogeneous variable, and not be affected by other variables. To access the column-wise estimators, use the estimator's forecasters_ parameter, which stores the fitted forecasters in a dataframe.

                                                                                                                                                                                                                                                                                                                                                                  Read more about time series tasks here.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#native-multioutput-models", "title": "Native multioutput models", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Some models have native support for multioutput tasks. This means that the original estimator is used to make predictions directly on all the target columns. Examples of such models are KNearestNeighbors, RandomForest and ExtraTrees.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#non-native-multioutput-models", "title": "Non-native multioutput models", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The majority of the models don't have integrated support for multioutput tasks. However, it's possible to still use them for such tasks, wrapping them in a meta-estimator capable of handling multiple target columns. For non-native multioutput models, ATOM does so automatically. For multilabel tasks, the meta-estimator is:

                                                                                                                                                                                                                                                                                                                                                                  • ClassifierChain

                                                                                                                                                                                                                                                                                                                                                                  And for multiclass-multioutput and multioutput regression, the meta-estimators are respectively:

                                                                                                                                                                                                                                                                                                                                                                  • MultioutputClassifier
                                                                                                                                                                                                                                                                                                                                                                  • MultioutputRegressor

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Currently, scikit-learn metrics do not support multiclass-multioutput classification tasks. In this case, ATOM calculates the mean of the selected metric over every individual target.

                                                                                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                                                                                  • Set the native_multilabel or native_multioutput parameter in ATOMModel equal to True to ignore the meta-estimator for custom models.
                                                                                                                                                                                                                                                                                                                                                                  • Check out the multilabel classification and multioutput regression examples.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#branches", "title": "Branches", "text": "

                                                                                                                                                                                                                                                                                                                                                                  You might want to compare how a model performs on a dataset transformed through multiple pipelines, each using different transformers. For example, on one pipeline with an undersampling strategy and the other with an oversampling strategy. To be able to do this, ATOM has a branching system.

                                                                                                                                                                                                                                                                                                                                                                  The branching system helps the user to manage multiple data pipelines within the same atom instance. Branches are created and accessed through atom's branch property. A branch contains a specific pipeline, the dataset transformed through that pipeline, and all data and utility attributes that refer to that dataset. Transformers and models called from atom use the dataset in the current branch, as well as data attributes such as atom.dataset. It's not allowed to change the data in a branch after fitting a model with it. Instead, create a new branch for every unique pipeline.

                                                                                                                                                                                                                                                                                                                                                                  By default, atom starts with one branch called \"main\". To start a new branch, set a new name to the property, e.g., atom.branch = \"undersample\". This creates a new branch from the current one. To create a branch from any other branch type \"_from_\" between the new name and the branch from which to split, e.g., atom.branch = \"oversample_from_main\" creates branch \"oversample\" from branch \"main\", even if the current branch is \"undersample\". To switch between existing branches, just type the name of the desired branch, e.g., atom.branch = \"main\" brings you back to the main branch. Note that every branch contains a unique copy of the whole dataset! Creating many branches can cause memory issues for large datasets.

                                                                                                                                                                                                                                                                                                                                                                  See the Imbalanced datasets or Feature engineering examples for branching use cases.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Always create a new branch if you want to change the dataset after fitting a model! Forcing a data change through the data property's @setter can cause unexpected model behavior and break down the plotting methods.

                                                                                                                                                                                                                                                                                                                                                                  Figure 1. Diagram of a possible branch system to compare an oversampling with an undersampling pipeline.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#memory-considerations", "title": "Memory considerations", "text": "

                                                                                                                                                                                                                                                                                                                                                                  An atom instance stores one copy of the dataset for each branch (this doesn't include the holdout set, which is only stored once), and one copy of the initial dataset with which the instance is initialized. This copy of the original dataset is necessary to avoid data leakage during hyperparameter tuning and for some specific methods like cross_validate and reset. It's created as soon as there are no branches in the initial state (usually after calling the first data transformation). If the dataset is occupying too much memory, consider using the shrink method to convert the dtypes to their smallest possible matching dtype.

                                                                                                                                                                                                                                                                                                                                                                  When working with large datasets and multiple branches, it becomes impossible to store all branches in memory at the same time. To avoid out-of-memory errors, use atom's memory parameter. If not False, atom saves the data of inactive branches as well as the original branch at the specified location (in a directory called joblib, the name of the underlying library managing the caching), maintaining only the current active branch in memory. This mechanism results in a slight drop in performance because of the I/O overhead, but can save a lot of memory. Additionally, the memory's location is also used to cache the output of the fit method of transformers in the pipeline. See here an example using the memory parameter.

                                                                                                                                                                                                                                                                                                                                                                  Apart from the dataset itself, a model's metric scores and shap values are also stored as attributes of the model to avoid having to recalculate them every time they are needed. You can delete all these attributes using the clear method in order to free some memory before saving atom.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#data-transformations", "title": "Data transformations", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Performing data transformations is a common requirement of many datasets before they are ready to be ingested by a model. ATOM provides various classes to apply data cleaning and feature engineering transformations to the data. This tooling should be able to help you apply most of the typically needed transformations to get the data ready for modeling. For further fine-tuning, it's also possible to transform the data using custom transformers (see the add method) or through a function (see the apply method). Remember that all transformations are only applied to the dataset in the current branch.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/data_management/#row-and-column-selection", "title": "Row and column selection", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Many methods in atom contain the rows or columns parameter to select a subset of the dataset. Examples are the evaluate and save_data methods for rows, and the distribution and shrink methods for columns. All data cleaning and feature engineering methods use the columns parameter to apply the transformation only to that selection of columns, and all prediction methods use the rows parameter to make predictions on that selection of rows.

                                                                                                                                                                                                                                                                                                                                                                  As you can see, these two parameters are very important and shared across many methods in atom. Rows and columns can be selected in multiple ways. The check is performed in the order described hereunder:

                                                                                                                                                                                                                                                                                                                                                                  1. By actual dataset, e.g., rows=atom.test is equal to rows=\"test\".
                                                                                                                                                                                                                                                                                                                                                                  2. By range or slice, e.g., rows=range(100) to select the first 100 rows from the dataset or rows=slice(20, 100) to select rows 20 to 99.
                                                                                                                                                                                                                                                                                                                                                                  3. By exact name, e.g., rows=[\"row1\", \"row2\"] to select rows with indices row1 and row2 or columns=[\"col1\", \"col2\"] to select columns col1 and col2. It's also possible to use the + sign to select multiple rows or columns, e.g., columns=\"col1+col2 is the same as columns=[\"col1\", \"col2\"].
                                                                                                                                                                                                                                                                                                                                                                  4. By position, e.g., rows=[0, 1, 2] to select the first three rows.
                                                                                                                                                                                                                                                                                                                                                                  5. By name of the data set (only for rows), e.g., rows=\"train\" to select all rows in the training set, or rows=\"test+holdout\" to select all rows in the test and holdout sets. Valid data sets are dataset, train, test and holdout.
                                                                                                                                                                                                                                                                                                                                                                  6. By dtype (only for columns), e.g., columns=\"number\" to select only numerical columns. See pandas' user guide.
                                                                                                                                                                                                                                                                                                                                                                  7. By regex match, e.g., columns=\"mean_.*\" to select all columns starting with mean_.
                                                                                                                                                                                                                                                                                                                                                                  8. Excluding instead of including using the ! sign, e.g. columns=\"!col1\" to select all columns except col1. You can also exclude multiple rows or columns like this columns=[\"!col1\", \"!col2\"] or this columns=\"!col1+!col2\". It's also possible to exclude data sets for row selection, e.g., columns=\"!train\" or dtypes for column selection, e.g., columns=\"!number\". Note that if a column name starts with !, the selection of that name will take priority over exclusion. Rows and columns can only be included or excluded, and not both at the same time. For example, this selection raises an exception column=[\"col1\", \"!col2\"].

                                                                                                                                                                                                                                                                                                                                                                  Info

                                                                                                                                                                                                                                                                                                                                                                  In some plotting methods, it's possible to plot separate lines for different subsets of the rows. For example, to compare the results on the train and test set. For these cases, either provide a sequence to the rows parameter for every line you want to draw, e.g., atom.plot_roc(rows=(\"train\", \"test\")), or provide a dictionary where the keys are the names of the sets (used in the legend) and the values are the corresponding selection of rows, selected using any of the aforementioned approaches, e.g, atom.plot_roc(rows={\"0-99\": range(100), \"100-199\": range(100, 200}). Note that for these methods, using atom.plot_roc(rows=\"train+test\"), only plots one line with the data from both sets. See the advanced plotting example.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/feature_engineering/", "title": "Feature engineering", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Feature engineering is the process of creating new features from the existing ones, in order to capture relationships with the target column that the first set of features didn't have on their own. This process is very important to improve the performance of machine learning algorithms. Although feature engineering works best when the data scientist applies use-case specific transformations, there are ways to do this in an automated manner, without prior domain knowledge. One of the problems of creating new features without human expert intervention, is that many of the newly created features can be useless, i.e., they do not help the algorithm to make better predictions. Even worse, having useless features can drop your performance. To avoid this, we perform feature selection, a process in which we select the relevant features in the dataset. See the Feature engineering example.

                                                                                                                                                                                                                                                                                                                                                                  Note

                                                                                                                                                                                                                                                                                                                                                                  • All of atom's feature engineering methods automatically adopt the relevant transformer attributes (n_jobs, verbose, logger, random_state) from atom. A different choice can be added as parameter to the method call, e.g., atom.feature_selection(\"pca\", n_features=10, random_state=2).
                                                                                                                                                                                                                                                                                                                                                                  • Like the add method, the feature engineering methods accept the columns parameter to only transform a subset of the dataset's features, e.g., atom.feature_selection(\"pca\",n_features=10, columns=slice(5, 15)). Read more in the row and column selection section.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/feature_engineering/#extracting-datetime-features", "title": "Extracting datetime features", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Features that contain dates or timestamps can not be directly ingested by models since they are not strictly numerical. Encoding them as categorical features is not an option since the encoding does not capture the relationship between the different moments in time. The FeatureExtractor class creates new features extracting datetime elements (e.g., day, month, year, hour...) from the columns. It can be accessed from atom through the feature_extraction method. The new features are named equally to the column from which they are extracted, followed by an underscore and the datetime element they create, e.g., x0_day for the day element of x0.

                                                                                                                                                                                                                                                                                                                                                                  Note that many time features have a cyclic pattern, e.g., after Sunday comes Monday. This means that if we would encode the days of the week from 0 to 6, we would lose that relation. A common method used to encode cyclical features is to transform the data into two dimensions using a sine and cosine transformation:

                                                                                                                                                                                                                                                                                                                                                                  \\[ x_{sin} = sin\\left(\\frac{2\\pi * x}{max(x)}\\right) \\] \\[ x_{cos} = cos\\left(\\frac{2\\pi * x}{max(x)}\\right) \\]

                                                                                                                                                                                                                                                                                                                                                                  The resulting features have their names followed by sin or cos, e.g. x0_day_sin and x0_day_cos. The datetime elements that can be encoded in a cyclic fashion are: microsecond, second, minute, hour, weekday, day, day_of_year, month and quarter. Note that decision trees based algorithms build their split rules according to one feature at a time. This means that they will fail to correctly process cyclic features since the sin/cos values are expected to be considered as one single coordinate system.

                                                                                                                                                                                                                                                                                                                                                                  Use the fmt parameter to specify your feature's format in case the column is categorical. The FeatureExtractor class will convert the column to the datetime dtype before extracting the specified features. Click here for an overview of the available formats.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/feature_engineering/#generating-new-features", "title": "Generating new features", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The FeatureGenerator class creates new non-linear features based on the original feature set. It can be accessed from atom through the feature_generation method. You can choose between two strategies: Deep Feature Synthesis and Genetic Feature Generation.

                                                                                                                                                                                                                                                                                                                                                                  Deep Feature Synthesis Deep feature synthesis (DFS) applies the selected operators on the features in the dataset. For example, if the operator is \"log\", it will create the new feature LOG(old_feature) and if the operator is \"mul\", it will create the new feature old_feature_1 x old_feature_2. The operators can be chosen through the operators parameter. Choose from:

                                                                                                                                                                                                                                                                                                                                                                  • add: Take the sum of two features.
                                                                                                                                                                                                                                                                                                                                                                  • sub: Subtract two features from each other.
                                                                                                                                                                                                                                                                                                                                                                  • mul: Multiply two features with each other.
                                                                                                                                                                                                                                                                                                                                                                  • div: Divide two features with each other.
                                                                                                                                                                                                                                                                                                                                                                  • abs: Calculate the absolute value of a feature.
                                                                                                                                                                                                                                                                                                                                                                  • srqt: Calculate the square root of a feature.
                                                                                                                                                                                                                                                                                                                                                                  • log: Calculate the natural logarithm of a feature.
                                                                                                                                                                                                                                                                                                                                                                  • sin: Calculate the sine of a feature.
                                                                                                                                                                                                                                                                                                                                                                  • cos: Calculate the cosine of a feature.
                                                                                                                                                                                                                                                                                                                                                                  • tan: Calculate the tangent of a feature.

                                                                                                                                                                                                                                                                                                                                                                  ATOM's implementation of DFS uses the featuretools package.

                                                                                                                                                                                                                                                                                                                                                                  Genetic Feature Generation Genetic feature generation (GFG) uses genetic programming, a branch of evolutionary programming, to determine which features are successful and create new ones based on those. Where dfs can be seen as some kind of \"brute force\" for feature engineering, gfg tries to improve its features with every generation of the algorithm. gfg uses the same operators as dfs, but instead of only applying the transformations once, it evolves them further, creating nested structures of combinations of features. The new features are given the name feature_n, where n stands for the n-th feature in the dataset. You can access the genetic feature's fitness and description (how they are calculated) through the genetic_features attribute.

                                                                                                                                                                                                                                                                                                                                                                  ATOM uses the SymbolicTransformer class from the gplearn package for the genetic algorithm. Read more about this implementation here.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/feature_engineering/#grouping-similar-features", "title": "Grouping similar features", "text": "

                                                                                                                                                                                                                                                                                                                                                                  When your dataset contains many similar features corresponding to a certain natural group or entity, it's possible to replace these features for a handful of them, that should capture the relations of the group, in order to lose as little information as possible. To achieve this, the FeatureGrouper class computes certain statistical properties that describe the group's distribution, like the mean or the median, and replaces the columns with the result of these statistical calculations over every row in the dataset. The goal of this approach is to reduce the number of columns in the dataset, avoiding the curse of dimensionality.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/feature_engineering/#selecting-useful-features", "title": "Selecting useful features", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The FeatureSelector class provides tooling to select the relevant features from a dataset. It can be accessed from atom through the feature_selection method.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/feature_engineering/#standard-strategies", "title": "Standard strategies", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Univariate Univariate feature selection works by selecting the best features based on univariate statistical F-test. The test is provided via the solver parameter. It takes any function taking two arrays (X, y), and returning arrays (scores, p-values). Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                                                                                  Principal Components Analysis Applying PCA reduces the dimensionality of the dataset by maximizing the variance of each dimension. The new features are called pca0, pca1, etc... PCA can be applied in three ways:

                                                                                                                                                                                                                                                                                                                                                                  • If the data is dense (i.e., not sparse), the estimator used is PCA. Before fitting the transformer, the data is scaled to mean=0 and std=1 if it wasn't already. Read more in sklearn's documentation.
                                                                                                                                                                                                                                                                                                                                                                  • If the data is [sparse][sparse datasets] (often the case for term-document matrices, see Vectorizer), the estimator used is TruncatedSVD. Read more in sklearn's documentation.
                                                                                                                                                                                                                                                                                                                                                                  • If engine is \"sklearnex\" or \"cuml\", the estimator used is the package's PCA implementation. Sparse data is not supported for neither engine.

                                                                                                                                                                                                                                                                                                                                                                  Selection from model SFM uses an estimator with feature_importances_ or coef_ attributes to select the best features in a dataset based on importance weights. The estimator is provided through the solver parameter and can be already fitted. ATOM allows you to use one its predefined models, e.g., solver=\"RF\". If you didn't call the FeatureSelector through atom, don't forget to indicate the estimator's task adding _class or _reg after the name, e.g., RF_class to use a random forest classifier. Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                                                                                  Sequential Feature Selection Sequential feature selection adds (forward selection) or removes (backward selection) features to form a feature subset in a greedy fashion. At each stage, this estimator chooses the best feature to add or remove based on the cross-validation score of an estimator. Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                                                                                  Recursive Feature Elimination Select features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features, and the importance of each feature is obtained either through a coef_ or through a feature_importances_ attribute. Then, the least important features are pruned from current set of features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached. Note that, since RFE needs to fit the model again every iteration, this method can be fairly slow.

                                                                                                                                                                                                                                                                                                                                                                  RFECV applies the same algorithm as RFE but uses a cross-validated metric (under the scoring parameter, see RFECV) to assess every step's performance. Also, where RFE returns the number of features selected by n_features, RFECV returns the number of features that achieved the optimal score on the specified metric. Note that this is not always equal to the amount specified by n_features. Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/feature_engineering/#advanced-strategies", "title": "Advanced strategies", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The following strategies are a collection of nature-inspired optimization algorithms that maximize an objective function. If not manually specified, the function calculates the cross-validated score of a model on the data. Use the scoring parameter (not present in description, part of kwargs) to specify the metric to optimize on.

                                                                                                                                                                                                                                                                                                                                                                  Particle Swarm Optimization Particle Swarm Optimization (PSO) optimizes a problem by having a population of candidate solutions (particles), and moving them around in the search-space according to simple mathematical formula over the particle's position and velocity. Each particle's movement is influenced by its local best known position, but is also guided toward the best known positions in the search-space, which are updated as better positions are found by other particles. This is expected to move the swarm toward the best solutions. Read more here.

                                                                                                                                                                                                                                                                                                                                                                  Harris Hawks Optimization Harris Hawks Optimization (HHO) mimics the action and reaction of Hawk's team collaboration hunting in nature and prey escaping to discover the solutions of the single-objective problem. Read more here.

                                                                                                                                                                                                                                                                                                                                                                  Grey Wolf Optimization The Grey Wolf Optimizer (GWO) mimics the leadership hierarchy and hunting mechanism of grey wolves in nature. Four types of grey wolves such as alpha, beta, delta, and omega are employed for simulating the leadership hierarchy. In addition, three main steps of hunting, searching for prey, encircling prey, and attacking prey, are implemented to perform optimization. Read more here.

                                                                                                                                                                                                                                                                                                                                                                  Dragonfly Optimization The Dragonfly Algorithm (DFO) algorithm originates from static and dynamic swarming behaviours. These two swarming behaviours are very similar to the two main phases of optimization using meta-heuristics: exploration and exploitation. Dragonflies create sub swarms and fly over different areas in a static swarm, which is the main objective of the exploration phase. In the static swarm, however, dragonflies fly in bigger swarms and along one direction, which is favourable in the exploitation phase. Read more here.

                                                                                                                                                                                                                                                                                                                                                                  Genetic Optimization Genetic Optimization is a metaheuristic inspired by the process of natural selection that belongs to the larger class of evolutionary algorithms. Genetic algorithms are commonly used to generate high-quality solutions to optimization and search problems by relying on biologically inspired operators such as mutation, crossover and selection. Read more here.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/feature_engineering/#other-selection-methods", "title": "Other selection methods", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Removing features with low or high variance Variance is the expectation of the squared deviation of a random variable from its mean. Features with low variance have many values repeated, which means the model can't learn much from them. In a similar way, features with very high variance have very few values repeated, which makes it also difficult for a model to learn from this feature.

                                                                                                                                                                                                                                                                                                                                                                  FeatureSelector removes a categorical feature when the maximum number of occurrences for any value is below min_repeated or when the same value is repeated in at least max_repeated fraction of the rows. The default option is to remove a feature if all values in it are either different or exactly the same.

                                                                                                                                                                                                                                                                                                                                                                  Removing features with multi-collinearity Two features that are highly correlated are redundant, i.e., two will not contribute more to the model than only one of them. FeatureSelector will drop a feature that has a Pearson correlation coefficient larger than max_correlation with another feature. A correlation of 1 means the two columns are equal. A dataframe of the removed features and their correlation values can be accessed through the collinear attribute.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/introduction/", "title": "Introduction", "text": "

                                                                                                                                                                                                                                                                                                                                                                  There is no magic formula in data science that can tell us which type of machine learning estimator in combination with which pipeline will perform best for a given raw dataset. Different models are better suited for different types of data and different types of problems. You can follow some rough guide on how to approach problems with regard to which model to try, but these are incomplete at best.

                                                                                                                                                                                                                                                                                                                                                                  During the exploration phase of a machine learning project, a data scientist tries to find the optimal pipeline for his specific use case. This usually involves applying standard data cleaning steps, creating or selecting useful features, trying out different models, etc. Testing multiple pipelines requires many lines of code, and writing it all in the same notebook often makes it long and cluttered. On the other hand, using multiple notebooks makes it harder to compare the results and to keep an overview. On top of that, refactoring the code for every test can be quite time-consuming. How many times have you conducted the same action to pre-process a raw dataset? How many times have you copy-and-pasted code from an old repository to re-use it in a new use case?

                                                                                                                                                                                                                                                                                                                                                                  Although best practices tell us to start with a simple model and build up to more complicated ones, many data scientists just use the model best known to them in order to avoid the aforementioned problems. This can result in poor performance (because the model is just not the right one for the task) or in inefficient management of time and computing resources (because a simpler/faster model could have achieved a similar performance).

                                                                                                                                                                                                                                                                                                                                                                  ATOM is here to help solve these common issues. The package acts as a wrapper of the whole machine learning pipeline, helping the data scientist to rapidly find a good model for his problem. Avoid endless imports and documentation lookups. Avoid rewriting the same code over and over again. With just a few lines of code, it's now possible to perform basic data cleaning steps, select relevant features and compare the performance of multiple models on a given dataset, providing quick insights on which pipeline performs best for the task at hand.

                                                                                                                                                                                                                                                                                                                                                                  It is important to realize that ATOM is not here to replace all the work a data scientist has to do before getting his model into production. ATOM doesn't spit out production-ready models just by tuning some parameters in its API. After helping you determine the right pipeline, you will most probably need to fine-tune it using use-case specific features and data cleaning steps in order to achieve maximum performance.

                                                                                                                                                                                                                                                                                                                                                                  Example steps taken by ATOM's pipeline:

                                                                                                                                                                                                                                                                                                                                                                  1. Data Cleaning
                                                                                                                                                                                                                                                                                                                                                                    • Handle missing values
                                                                                                                                                                                                                                                                                                                                                                    • Encode categorical features
                                                                                                                                                                                                                                                                                                                                                                    • Detect and remove outliers
                                                                                                                                                                                                                                                                                                                                                                    • Balance the training set
                                                                                                                                                                                                                                                                                                                                                                  2. Feature engineering
                                                                                                                                                                                                                                                                                                                                                                    • Create new non-linear features
                                                                                                                                                                                                                                                                                                                                                                    • Select the most promising features
                                                                                                                                                                                                                                                                                                                                                                  3. Train and validate multiple models
                                                                                                                                                                                                                                                                                                                                                                    • Apply hyperparameter tuning
                                                                                                                                                                                                                                                                                                                                                                    • Fit the models on the training set
                                                                                                                                                                                                                                                                                                                                                                    • Evaluate the results on the test set
                                                                                                                                                                                                                                                                                                                                                                  4. Analyze the results
                                                                                                                                                                                                                                                                                                                                                                    • Get the scores on various metrics
                                                                                                                                                                                                                                                                                                                                                                    • Make plots to compare the model performances

                                                                                                                                                                                                                                                                                                                                                                  Figure 1. Diagram of a possible pipeline created by ATOM."}, {"location": "user_guide/logging/", "title": "Logging & Tracking", "text": ""}, {"location": "user_guide/logging/#logging", "title": "Logging", "text": "

                                                                                                                                                                                                                                                                                                                                                                  To start logging your experiments, fill the logger parameter with the name or path to store the logging file. If automatic naming is used, the file is saved using the __name__ of the class followed by the timestamp of the logger's creation, e.g. ATOMClassifier_11May21_20h11m03s. The logging file contains method calls, all printed messages to stdout with maximum verbosity, and any exception raised during running. Additionally, the logging entries of external libraries are redirected to the same file handler.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/logging/#tracking", "title": "Tracking", "text": "

                                                                                                                                                                                                                                                                                                                                                                  ATOM uses MLflow Tracking as a backend API and UI for logging models, parameters, pipelines, data and plots. Start tracking your experiments assigning a name to the experiment parameter. Every model is tracked using a separate run. When no backend is configured, the data is stored locally at ./mlruns. To configure the backend, use mlflow.set_tracking_uri in your notebook or IDE before initializing atom. This does not affect the currently active run (if one exists), but takes effect for successive runs. Run mlflow ui on your terminal to open MLflow's Tracking UI and view it at http://localhost:5000.

                                                                                                                                                                                                                                                                                                                                                                  Note

                                                                                                                                                                                                                                                                                                                                                                  When using ATOM on Databricks, the experiment's name should include the complete path to the storage, e.g., /Users/username@domain.com/experiment_name.

                                                                                                                                                                                                                                                                                                                                                                  Example

                                                                                                                                                                                                                                                                                                                                                                  from atom import ATOMClassifier\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\natom = ATOMClassifier(X, y, experiment=\"breast_cancer\")\natom.run(models=[\"LR\", \"RF\", \"LGB\"], n_trials=(0, 0, 10))\n

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/logging/#dagshub-integration", "title": "DAGsHub integration", "text": "

                                                                                                                                                                                                                                                                                                                                                                  ATOM has a build-in integration with DAGsHub, a web platform based on open source tools, optimized for data science and oriented towards the open source community. To store your mlflow experiments in a DAGsHub repo, type dagshub:<experiment_name> in the experiment parameter (instead of just the experiment's name). If the repo does not already exist, a new public repo is created.

                                                                                                                                                                                                                                                                                                                                                                  Info

                                                                                                                                                                                                                                                                                                                                                                  If you are logged into your DAGsHub account when initializing atom with a dagshub experiment, a page on your web browser is automatically opened to give access permissions. If not, read here how to set up your DAGsHub credentials.

                                                                                                                                                                                                                                                                                                                                                                  Example

                                                                                                                                                                                                                                                                                                                                                                  from atom import ATOMClassifier\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\natom = ATOMClassifier(X, y, experiment=\"dagshub:breast_cancer\")\natom.run(models=[\"LR\", \"RF\"])\n

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/logging/#tracked-elements", "title": "Tracked elements", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Tags The runs are automatically tagged with the model's full name, the branch from which the model was trained, and the time it took to fit the model. Add additional custom tags through the ht_params parameter, e.g., atom.run([\"LR\", \"RF\"], ht_params={\"tags\": {\"tag1\": 1}}).

                                                                                                                                                                                                                                                                                                                                                                  Parameters All parameters used by the estimator at initialization are tracked. Additional parameters passed to the fit method are not tracked.

                                                                                                                                                                                                                                                                                                                                                                  Model The model's estimator is stored as artifact. The estimator has to be compatible with the mlflow.sklearn, module.

                                                                                                                                                                                                                                                                                                                                                                  Hyperparameter tuning If hyperparameter tuning is performed, every trial is tracked as a nested run in the model's main run. This option can be switched off using atom's log_ht attribute, e.g., atom.log_ht = False. The data and pipeline options are never stored within nested runs.

                                                                                                                                                                                                                                                                                                                                                                  Metrics All metric results are tracked, not only during training, but also when the evaluate method is called at a later point. Metrics calculated during in-training validation are also stored.

                                                                                                                                                                                                                                                                                                                                                                  Dataset The train and test sets used to fit and evaluate the model can be stored as .csv files to the run's artifacts. This option can be switched on using atom's log_data attribute, e.g. atom.log_data = True.

                                                                                                                                                                                                                                                                                                                                                                  Pipeline The model's pipeline (returned from the export_pipeline method) can be stored as an artifact. This option can be switched on using atom's log_pipeline attribute, e.g., atom.log_pipeline = True.

                                                                                                                                                                                                                                                                                                                                                                  Plots By default, plots are stored as .html artifacts in all runs corresponding to the models that are showed in the plot. If the filename parameter is specified, they are stored under that name, else the method's name is used. This option can be switched off using atom's log_plots attribute, e.g., atom.log_plots = False.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/models/", "title": "Models", "text": ""}, {"location": "user_guide/models/#predefined-models", "title": "Predefined models", "text": "

                                                                                                                                                                                                                                                                                                                                                                  ATOM provides many models for classification and regression tasks that can be used to fit the data in the pipeline. After fitting, a class containing the underlying estimator is attached to atom as an attribute. We refer to these \"subclasses\" as models. Apart from the estimator, the models contain a variety of attributes and methods that can help you understand how the underlying estimator performed. They can be accessed using their acronyms, e.g., atom.LGB to access the LightGBM model. The available models and their corresponding acronyms are:

                                                                                                                                                                                                                                                                                                                                                                  • AdaBoost (AdaB)
                                                                                                                                                                                                                                                                                                                                                                  • ARIMA (Arima)
                                                                                                                                                                                                                                                                                                                                                                  • AutoARIMA (AutoARIMA)
                                                                                                                                                                                                                                                                                                                                                                  • AutomaticRelevanceDetermination (ARD)
                                                                                                                                                                                                                                                                                                                                                                  • Bagging (Bag)
                                                                                                                                                                                                                                                                                                                                                                  • BayesianRidge (BR)
                                                                                                                                                                                                                                                                                                                                                                  • BernoulliNB (BNB)
                                                                                                                                                                                                                                                                                                                                                                  • CatBoost (CatB)
                                                                                                                                                                                                                                                                                                                                                                  • CategoricalNB (CatNB)
                                                                                                                                                                                                                                                                                                                                                                  • ComplementNB (CNB)
                                                                                                                                                                                                                                                                                                                                                                  • DecisionTree (Tree)
                                                                                                                                                                                                                                                                                                                                                                  • Dummy (Dummy)
                                                                                                                                                                                                                                                                                                                                                                  • ElasticNet (EN)
                                                                                                                                                                                                                                                                                                                                                                  • ETS (ETS)
                                                                                                                                                                                                                                                                                                                                                                  • ExponentialSmoothing (ES)
                                                                                                                                                                                                                                                                                                                                                                  • ExtraTree (ETree)
                                                                                                                                                                                                                                                                                                                                                                  • ExtraTrees (ET)
                                                                                                                                                                                                                                                                                                                                                                  • GaussianNB (GNB)
                                                                                                                                                                                                                                                                                                                                                                  • GaussianProcess (GP)
                                                                                                                                                                                                                                                                                                                                                                  • GradientBoostingMachine (GBM)
                                                                                                                                                                                                                                                                                                                                                                  • HuberRegression (Huber)
                                                                                                                                                                                                                                                                                                                                                                  • HistGradientBoosting (hGBM)
                                                                                                                                                                                                                                                                                                                                                                  • KNearestNeighbors (KNN)
                                                                                                                                                                                                                                                                                                                                                                  • Lasso (Lasso)
                                                                                                                                                                                                                                                                                                                                                                  • LeastAngleRegression (Lars)
                                                                                                                                                                                                                                                                                                                                                                  • LightGBM (LGB)
                                                                                                                                                                                                                                                                                                                                                                  • LinearDiscriminantAnalysis (LDA)
                                                                                                                                                                                                                                                                                                                                                                  • LinearSVM (lSVM)
                                                                                                                                                                                                                                                                                                                                                                  • LogisticRegression (LR)
                                                                                                                                                                                                                                                                                                                                                                  • MultiLayerPerceptron (MLP)
                                                                                                                                                                                                                                                                                                                                                                  • MultinomialNB (MNB)
                                                                                                                                                                                                                                                                                                                                                                  • NaiveForecaster (NF)
                                                                                                                                                                                                                                                                                                                                                                  • OrdinaryLeastSquares (OLS)
                                                                                                                                                                                                                                                                                                                                                                  • OrthogonalMatchingPursuit (OMP)
                                                                                                                                                                                                                                                                                                                                                                  • PassiveAggressive (PA)
                                                                                                                                                                                                                                                                                                                                                                  • Perceptron (Perc)
                                                                                                                                                                                                                                                                                                                                                                  • PolynomialTrend (PT)
                                                                                                                                                                                                                                                                                                                                                                  • QuadraticDiscriminantAnalysis (QDA)
                                                                                                                                                                                                                                                                                                                                                                  • RadiusNearestNeighbors (RNN)
                                                                                                                                                                                                                                                                                                                                                                  • RandomForest (RF)
                                                                                                                                                                                                                                                                                                                                                                  • Ridge (Ridge)
                                                                                                                                                                                                                                                                                                                                                                  • StochasticGradientDescent (SGD)
                                                                                                                                                                                                                                                                                                                                                                  • SupportVectorMachine (SVM)
                                                                                                                                                                                                                                                                                                                                                                  • XGBoost (XGB)

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  The model classes can not be initialized directly by the user! Use them only through atom.

                                                                                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                                                                                  The acronyms are case-insensitive, e.g., atom.lgb also calls the LightGBM model.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/models/#custom-models", "title": "Custom models", "text": "

                                                                                                                                                                                                                                                                                                                                                                  It is also possible to create your own models in ATOM's pipeline. For example, imagine we want to use sklearn's RANSACRegressor estimator (note that is not included in ATOM's predefined models). There are two ways to achieve this:

                                                                                                                                                                                                                                                                                                                                                                  • Using ATOMModel (recommended). With this approach you can pass the required model characteristics to the pipeline.
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMRegressor, ATOMModel\n>>> from sklearn.datasets import load_diabetes\n>>> from sklearn.linear_model import RANSACRegressor\n\n>>> ransac = ATOMModel(RANSACRegressor, name=\"RANSAC\", needs_scaling=True)\n\n>>> X, y = load_diabetes(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMRegressor(X, y)\n>>> atom.run(ransac)\n
                                                                                                                                                                                                                                                                                                                                                                  • Using the estimator's class or an instance of the class. This approach will also call ATOMModel under the hood, but it will leave its parameters to their default values.
                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMRegressor\n>>> from sklearn.datasets import load_diabetes\n>>> from sklearn.linear_model import RANSACRegressor\n\n>>> X, y = load_diabetes(return_X_y=True, as_frame=True)\n\n>>> atom = ATOMRegressor(X, y)\n>>> atom.run(RANSACRegressor)\n

                                                                                                                                                                                                                                                                                                                                                                  Additional things to take into account:

                                                                                                                                                                                                                                                                                                                                                                  • Custom models can be accessed through their acronym like any other model, e.g. atom.ransac in the example above.
                                                                                                                                                                                                                                                                                                                                                                  • Custom models are not restricted to sklearn estimators, but they should follow sklearn's API, i.e., have a fit and predict method.
                                                                                                                                                                                                                                                                                                                                                                  • Parameter customization (for the initializer) is only possible for custom models which provide an estimator that has a set_params() method, i.e., it's a child class of BaseEstimator.
                                                                                                                                                                                                                                                                                                                                                                  • Hyperparameter tuning for custom models is ignored unless appropriate dimensions are provided through ht_params.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/models/#deep-learning", "title": "Deep learning", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Deep learning models can be used through ATOM's custom models as long as they follow sklearn's API. For example, models implemented with the Keras package should use the scikeras wrappers KerasClassifier or KerasRegressor.

                                                                                                                                                                                                                                                                                                                                                                  Many deep learning use cases, for example in computer vision, use datasets with more than 2 dimensions, e.g., image data can have shape (n_samples, length, width, rgb). Luckily, scikeras has a workaround to be able to work with such datasets. Learn with this example how to use ATOM to train and validate a Convolutional Neural Network on an image dataset.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Models implemented with keras can only use custom hyperparameter tuning when n_jobs=1 or ht_params={\"cv\": 1}. Using n_jobs > 1 and cv > 1 raises a PicklingError due to incompatibilities of the APIs.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/models/#ensembles", "title": "Ensembles", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Ensemble models use multiple estimators to obtain better predictive performance than could be obtained from any of the constituent learning algorithms alone. ATOM implements two ensemble techniques: voting and stacking. Click here to see an example that uses ensemble models.

                                                                                                                                                                                                                                                                                                                                                                  If the ensemble's underlying estimator is a model that used automated feature scaling, it's added as a Pipeline containing the scaler and estimator. If a mlflow experiment is active, the ensembles start their own run, just like the predefined models do.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/models/#voting", "title": "Voting", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The idea behind voting is to combine the predictions of conceptually different models to make new predictions. Such a technique can be useful for a set of equally well performing models in order to balance out their individual weaknesses. Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                                                                                  A voting model is created from a trainer through the voting method. The voting model is added automatically to the list of models in the trainer, under the Vote acronym. The underlying estimator is a custom adaptation of VotingClassifier or VotingRegressor depending on the task. The differences between ATOM's and sklearn's implementation are:

                                                                                                                                                                                                                                                                                                                                                                  • ATOM's implementation doesn't fit estimators if they're already fitted.
                                                                                                                                                                                                                                                                                                                                                                  • ATOM's instance is considered fitted at initialization when all underlying estimators are.
                                                                                                                                                                                                                                                                                                                                                                  • ATOM's VotingClassifier doesn't implement a LabelEncoder to encode the target column.

                                                                                                                                                                                                                                                                                                                                                                  The two estimators are customized in this way to save time and computational resources, since the classes are always initialized with fitted estimators. As a consequence of this, the VotingClassifier can not use sklearn's build-in LabelEncoder for the target column since it can't be fitted when initializing the class. For the vast majority of use cases, the changes will have no effect. If you want to export the estimator and retrain it on different data, just make sure to clone the underlying estimators first.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/models/#stacking", "title": "Stacking", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Stacking is a method for combining estimators to reduce their biases. More precisely, the predictions of each individual estimator are stacked together and used as input to a final estimator to compute the prediction. Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                                                                                  A stacking model is created from a trainer through the stacking method. The stacking model is added automatically to the list of models in the trainer, under the Stack acronym. The underlying estimator is a custom adaptation of StackingClassifier or StackingRegressor depending on the task. The only difference between ATOM's and sklearn's implementation is that ATOM's implementation doesn't fit estimators if they're already fitted. The two estimators are customized in this way to save time and computational resources, since the classes are always initialized with fitted estimators. For the vast majority of use cases, the changes will have no effect. If you want to export the estimator and retrain it on different data, just make sure to clone the underlying estimators first.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/nlp/", "title": "Natural Language Processing", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Natural Language Processing (NLP) is the subfield of machine learning that works with human language data. The nlp module contains four classes that help to convert raw text to meaningful numeric values, ready to be ingested by a model. ATOM uses the nltk library for the majority of its NLP processes.

                                                                                                                                                                                                                                                                                                                                                                  The text documents are expected to be provided in a column of the dataframe named corpus (the name is case-insensitive). Only the corpus is changed by the transformers, leaving the rest of the columns as is. This mechanism allows atom to combine datasets containing a text corpus with other non-text features. If an array is provided as input, it should consist of only one feature containing the text (one document per row). ATOM will then automatically convert the array to a dataframe with the desired column name. Documents are expected to be strings or sequences of words. Click here for an example using text data.

                                                                                                                                                                                                                                                                                                                                                                  Note

                                                                                                                                                                                                                                                                                                                                                                  All of atom's NLP methods automatically adopt the relevant transformer attributes (verbose, logger) from atom. A different choice can be added as parameter to the method call, e.g., atom.tokenize(verbose=0).

                                                                                                                                                                                                                                                                                                                                                                  Info

                                                                                                                                                                                                                                                                                                                                                                  ATOM doesn't do topic modeling! The module's goal is to help process text documents into features that can be used for supervised learning.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/nlp/#text-cleaning", "title": "Text cleaning", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Text data is rarely clean. Whether it's scraped from a website or inferred from paper documents, it's always populated with irrelevant information for the model, such as email addresses, HTML tags, numbers or punctuation marks. Use the TextCleaner class to clean the corpus from such noise. It can be accessed from atom through the textclean method. Use the class' parameters to choose which transformations to perform. The available steps are:

                                                                                                                                                                                                                                                                                                                                                                  • Decode unicode characters to their ascii representations.
                                                                                                                                                                                                                                                                                                                                                                  • Convert all characters to lower case.
                                                                                                                                                                                                                                                                                                                                                                  • Drop email addresses from the text.
                                                                                                                                                                                                                                                                                                                                                                  • Drop URL links from the text.
                                                                                                                                                                                                                                                                                                                                                                  • Drop HTML tags from the text.
                                                                                                                                                                                                                                                                                                                                                                  • Drop emojis from the text.
                                                                                                                                                                                                                                                                                                                                                                  • Drop numbers from the text.
                                                                                                                                                                                                                                                                                                                                                                  • Drop punctuations from the text.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/nlp/#tokenization", "title": "Tokenization", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Some text processing algorithms, like stemming or lemmatization, require the corpus to be made out of tokens, instead of strings, in order to know what to consider as words. Tokenization is used to achieve this. It separates every document into a sequence of smaller units. In this case, the words.

                                                                                                                                                                                                                                                                                                                                                                  Sometimes, words have a different meaning on their own than when combined with adjacent words. For example, the word new has a completely different meaning when the word york is directly after it than when it's not. These combinations of two words are called bigrams. When there are three words, they are called trigrams, and with four words quadgrams.

                                                                                                                                                                                                                                                                                                                                                                  The Tokenizer class converts a document into a sequence of words, and can create the most frequent bigrams, trigrams and quadgrams. It can be accessed from atom through the tokenize method.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/nlp/#text-normalization", "title": "Text Normalization", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Normalization for texts is a process that converts a list of words to a more uniform standard. This is useful to reduce the amount of different information that the computer has to deal with, and therefore improves efficiency. The goal of normalization techniques like stemming and lemmatization is to reduce inflectional and related forms of a word to a common base form.

                                                                                                                                                                                                                                                                                                                                                                  Normalize the words in the corpus using the TextNormalizer class. It can be accessed from atom through the textnormalize method.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/nlp/#vectorization", "title": "Vectorization", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Text data cannot be fed directly to the algorithms themselves, as most of them expect numerical feature vectors with a fixed size, rather than words in the text documents with variable length. Vectorization is the general process of turning a collection of text documents into numerical feature vectors. You can apply it to the corpus using the Vectorizer class. It can be accessed from atom through the vectorize method.

                                                                                                                                                                                                                                                                                                                                                                  Info

                                                                                                                                                                                                                                                                                                                                                                  All strategies can utilize GPU speed-up. Click here for further information about GPU acceleration.

                                                                                                                                                                                                                                                                                                                                                                  Bag of Words The Bag of Words (BOW) strategy applies tokenization, counting and normalization to the corpus. Documents are described by word occurrences while completely ignoring the relative position information of the words in the document. The created columns are named with the words they are embedding with the prefix corpus_. Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                                                                                  TF-IDF In a large text corpus, some words will be very present (e.g., \u201cthe\u201d, \u201ca\u201d, \u201cis\u201d in English), hence carrying very little meaningful information about the actual contents of the document. If we were to feed the direct count data directly to a classifier, those very frequent terms would shadow the frequencies of rarer, yet more interesting, terms. Use the TF-IDF strategy to re-weight the count features into floating point values. The created columns are named with the words they are embedding with the prefix corpus_. Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                                                                                  Hashing The larger the corpus, the larger the vocabulary will grow and thus increasing the number of features and memory use. Use the Hashing strategy to hash the words to a specified number of features. The created features are named hash0, hash1, etc... Read more in sklearn's documentation.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/nomenclature/", "title": "Nomenclature", "text": "

                                                                                                                                                                                                                                                                                                                                                                  This documentation consistently uses terms to refer to certain concepts related to this package. The most frequent terms are described hereunder.

                                                                                                                                                                                                                                                                                                                                                                  ATOM

                                                                                                                                                                                                                                                                                                                                                                  Refers to this package.

                                                                                                                                                                                                                                                                                                                                                                  atom

                                                                                                                                                                                                                                                                                                                                                                  Instance of the ATOMClassifier, ATOMForecaster or ATOMRegressor classes (note that the examples use it as the default variable name).

                                                                                                                                                                                                                                                                                                                                                                  A pipeline, corresponding dataset and models fitted to that dataset. See the branches section of the user guide.

                                                                                                                                                                                                                                                                                                                                                                  categorical columns

                                                                                                                                                                                                                                                                                                                                                                  Refers to all columns of type object or category.

                                                                                                                                                                                                                                                                                                                                                                  class

                                                                                                                                                                                                                                                                                                                                                                  Unique value in a column, e.g., a binary classifier has 2 classes in the target column.

                                                                                                                                                                                                                                                                                                                                                                  dataframe

                                                                                                                                                                                                                                                                                                                                                                  Two-dimensional, size-mutable, potentially heterogeneous tabular data of type pd.DataFrame or its modin counterpart.

                                                                                                                                                                                                                                                                                                                                                                  dataframe-like

                                                                                                                                                                                                                                                                                                                                                                  Any type object from which a dataframe can be created. This includes an iterable, a dict whose values are 1d-arrays, a two-dimensional list, tuple, np.ndarray or sps.csr_matrix, and most commonly, a dataframe. This is the standard input format for any dataset.

                                                                                                                                                                                                                                                                                                                                                                  Additionally, you can provide a callable whose output is any of the aforementioned types. This is useful when the dataset is very large and you are performing parallel operations, since it can avoid broadcasting a large dataset from the driver to the workers.

                                                                                                                                                                                                                                                                                                                                                                  estimator

                                                                                                                                                                                                                                                                                                                                                                  An object which manages the estimation and decoding of an algorithm. The algorithm is estimated as a deterministic function of a set of parameters, a dataset and a random state. Should implement a fit method. Often used interchangeably with predictor because of user preference.

                                                                                                                                                                                                                                                                                                                                                                  index

                                                                                                                                                                                                                                                                                                                                                                  Immutable sequence used for indexing and alignment of type pd.Index or their modin counterpart.

                                                                                                                                                                                                                                                                                                                                                                  missing values

                                                                                                                                                                                                                                                                                                                                                                  All values in the missing attribute, as well as None, NaN, +inf and -inf.

                                                                                                                                                                                                                                                                                                                                                                  model

                                                                                                                                                                                                                                                                                                                                                                  Instance of a model in atom. Not to confuse with estimator.

                                                                                                                                                                                                                                                                                                                                                                  outliers

                                                                                                                                                                                                                                                                                                                                                                  Sample that contains one or more outlier values. Note that the Pruner class can use a different definition for outliers depending on the chosen strategy.

                                                                                                                                                                                                                                                                                                                                                                  outlier value

                                                                                                                                                                                                                                                                                                                                                                  Value that lies further than 3 times the standard deviation away from the mean of its column, i.e., |z-score| > 3.

                                                                                                                                                                                                                                                                                                                                                                  predictor

                                                                                                                                                                                                                                                                                                                                                                  An estimator implementing a predict method.

                                                                                                                                                                                                                                                                                                                                                                  scorer

                                                                                                                                                                                                                                                                                                                                                                  A non-estimator callable object which evaluates an estimator on given test data, returning a number. Unlike evaluation metrics, a greater returned number must correspond with a better score. See sklearn's documentation.

                                                                                                                                                                                                                                                                                                                                                                  segment

                                                                                                                                                                                                                                                                                                                                                                  Subset (segment) of a sequence, whether through slicing or generating a range of values. When given as a parameter type, it includes both range and slice.

                                                                                                                                                                                                                                                                                                                                                                  sequence

                                                                                                                                                                                                                                                                                                                                                                  A one-dimensional, indexable array of type sequence (except string), np.ndarray, index or series. This is the standard input format for a dataset's target column.

                                                                                                                                                                                                                                                                                                                                                                  series

                                                                                                                                                                                                                                                                                                                                                                  One-dimensional ndarray with axis labels of type pd.Series or its modin counterpart.

                                                                                                                                                                                                                                                                                                                                                                  target

                                                                                                                                                                                                                                                                                                                                                                  The dependent variable in a supervised learning task. Passed as y to an estimator's fit method.

                                                                                                                                                                                                                                                                                                                                                                  task

                                                                                                                                                                                                                                                                                                                                                                  One of the supervised machine learning approaches that ATOM supports:

                                                                                                                                                                                                                                                                                                                                                                  • binary classification
                                                                                                                                                                                                                                                                                                                                                                  • multiclass classification
                                                                                                                                                                                                                                                                                                                                                                  • multilabel classification
                                                                                                                                                                                                                                                                                                                                                                  • multiclass-multioutput classification
                                                                                                                                                                                                                                                                                                                                                                  • regression
                                                                                                                                                                                                                                                                                                                                                                  • multioutput regression
                                                                                                                                                                                                                                                                                                                                                                  • univariate forecast
                                                                                                                                                                                                                                                                                                                                                                  • multivariate forecast
                                                                                                                                                                                                                                                                                                                                                                  transformer

                                                                                                                                                                                                                                                                                                                                                                  An estimator implementing a transform method. This encompasses all data cleaning and feature engineering classes.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/plots/", "title": "Plots", "text": "

                                                                                                                                                                                                                                                                                                                                                                  ATOM provides many plotting methods to analyze the data or compare the model performances. Descriptions and examples can be found in the API section. ATOM mainly uses the plotly library for plotting. Plotly makes interactive, publication-quality graphs that are rendered using html. Some plots require other libraries like matplotlib, shap, wordcloud and schemdraw.

                                                                                                                                                                                                                                                                                                                                                                  Plots that compare model performances (methods with the models parameter) can be called directly from atom, e.g., atom.plot_roc(), or from one of the models, e.g., atom.adab.plot_roc(). If called from atom, use the models parameter to specify which models to plot. If called from a specific model, it makes the plot only for that model and the models parameter becomes unavailable.

                                                                                                                                                                                                                                                                                                                                                                  Plots that analyze the data (methods without the models parameter) can only be called from atom, and not from the models.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/plots/#parameters", "title": "Parameters", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Apart from the plot-specific parameters, all plots have five parameters in common:

                                                                                                                                                                                                                                                                                                                                                                  • The title parameter adds a title to the plot. The default value doesn't show any title. Provide a configuration (as dictionary) to customize its appearance, e.g., title=dict(text=\"Awesome plot\", color=\"red\"). Read more in plotly's documentation.
                                                                                                                                                                                                                                                                                                                                                                  • The legend parameter is used to show/hide, position or customize the plot's legend. Provide a configuration (as dictionary) to customize its appearance (e.g., legend=dict(title=\"Title for legend\", title_font_color=\"red\")) or choose one of the following locations:

                                                                                                                                                                                                                                                                                                                                                                    • upper left
                                                                                                                                                                                                                                                                                                                                                                    • upper right
                                                                                                                                                                                                                                                                                                                                                                    • lower left
                                                                                                                                                                                                                                                                                                                                                                    • lower right
                                                                                                                                                                                                                                                                                                                                                                    • upper center
                                                                                                                                                                                                                                                                                                                                                                    • lower center
                                                                                                                                                                                                                                                                                                                                                                    • center left
                                                                                                                                                                                                                                                                                                                                                                    • center right
                                                                                                                                                                                                                                                                                                                                                                    • center
                                                                                                                                                                                                                                                                                                                                                                    • out: Position the legend outside the axis, on the right hand side. This is plotly's default position. Note that this shrinks the size of the axis to fit both legend and axes in the specified figsize.
                                                                                                                                                                                                                                                                                                                                                                  • The figsize parameter adjust the plot's size.

                                                                                                                                                                                                                                                                                                                                                                  • The filename parameter is used to save the plot.
                                                                                                                                                                                                                                                                                                                                                                  • The display parameter determines whether to show or return the plot.

                                                                                                                                                                                                                                                                                                                                                                  Info

                                                                                                                                                                                                                                                                                                                                                                  In some plotting methods, it's possible to plot separate lines for different subsets of the rows. For example, to compare the results on the train and test set. For these cases, either provide a sequence to the rows parameter for every line you want to draw, e.g., atom.plot_roc(rows=(\"train\", \"test\")), or provide a dictionary where the keys are the names of the sets (used in the legend) and the values are the corresponding selection of rows, selected using any of the aforementioned approaches, e.g, atom.plot_roc(rows={\"0-99\": range(100), \"100-199\": range(100, 200}). Note that for these methods, using atom.plot_roc(rows=\"train+test\"), only plots one line with the data from both sets. See the advanced plotting example.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/plots/#aesthetics", "title": "Aesthetics", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The plot's aesthetics can be customized using the plot attributes prior to calling the plotting method, e.g., atom.title_fontsize = 30. The default values are:

                                                                                                                                                                                                                                                                                                                                                                  • palette: [\"rgb(0, 98, 98)\", \"rgb(56, 166, 165)\", \"rgb(115, 175, 72)\", \"rgb(237, 173, 8)\", \"rgb(225, 124, 5)\", \"rgb(204, 80, 62)\", \"rgb(148, 52, 110)\", \"rgb(111, 64, 112)\", \"rgb(102, 102, 102)\"]
                                                                                                                                                                                                                                                                                                                                                                  • title_fontsize: 24
                                                                                                                                                                                                                                                                                                                                                                  • label_fontsize: 16
                                                                                                                                                                                                                                                                                                                                                                  • tick_fontsize: 12

                                                                                                                                                                                                                                                                                                                                                                  Use atom's update_layout method to further customize the plot's layout using any of plotly's layout properties, e.g., atom.update_layout(template=\"plotly_dark\"). Similarly, use the update_traces method to customize the traces properties, e.g. atom.update_traces(mode=\"lines+markers\").

                                                                                                                                                                                                                                                                                                                                                                  The reset_aesthetics method allows you to reset all aesthetics to their default value. See advanced plotting for an example.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/plots/#canvas", "title": "Canvas", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Use the canvas method to draw multiple plots side by side, for example to make it easier to compare similar results. The canvas method is a @contextmanager, i.e., it's used through Python's with command. Plots in a canvas ignore the legend, figsize, filename and display parameters. Instead, specify these parameters in the canvas. If a variable is assigned to the canvas (e.g., with atom.canvas() as fig), it yields the resulting figure.

                                                                                                                                                                                                                                                                                                                                                                  For example, we can use a canvas to compare the results of a XGBoost and LightGBM model on the train and test set. We could also draw the lines for both models in the same axes, but that would clutter the plot too much. Click here for more examples.

                                                                                                                                                                                                                                                                                                                                                                  >>> from atom import ATOMClassifier\n>>> from sklearn.datasets import make_classification\n\n>>> X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n>>> atom = ATOMClassifier(X, y)\n>>> atom.run([\"XGB\", \"LGB\"])\n\n>>> with atom.canvas(2, 2, title=\"XGBoost vs LightGBM\"):\n...     atom.xgb.plot_roc(rows=\"train+test\", title=\"ROC - XGBoost\")\n...     atom.lgb.plot_roc(rows=\"train+test\", title=\"ROC - LightGBM\")\n...     atom.xgb.plot_prc(rows=\"train+test\", title=\"PRC - XGBoost\")\n...     atom.lgb.plot_prc(rows=\"train+test\", title=\"PRC - LightGBM\")\n

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/plots/#shap", "title": "SHAP", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The SHAP (SHapley Additive exPlanations) python package uses a game theoretic approach to explain the output of any machine learning model. It connects optimal credit allocation with local explanations using the classic Shapley values from game theory and their related extensions. ATOM implements methods to plot 7 of SHAP's plotting functions directly from its API. A list of available shap plots can be found here.

                                                                                                                                                                                                                                                                                                                                                                  Calculating the Shapley values is computationally expensive, especially for model agnostic explainers like Permutation. To avoid having to recalculate the values for every plot, ATOM stores the shapley values internally after the first calculation, and access them later when needed again.

                                                                                                                                                                                                                                                                                                                                                                  Note

                                                                                                                                                                                                                                                                                                                                                                  Since the plot figures are not made by ATOM, note the following:

                                                                                                                                                                                                                                                                                                                                                                  • It's not possible to draw multiple models in the same figure. Selecting more than one model will raise an exception. To avoid this, call the plot directly from a model, e.g., atom.lr.plot_shap_force().
                                                                                                                                                                                                                                                                                                                                                                  • The returned plot is a matplotlib figure, not plotly's.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/plots/#available-plots", "title": "Available plots", "text": "

                                                                                                                                                                                                                                                                                                                                                                  A list of available plots can be found hereunder. Note that not all plots can be called from every class and that their availability can depend on the task at hand.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/plots/#data-plots", "title": "Data plots", "text": "

                                                                                                                                                                                                                                                                                                                                                                  plot_componentsPlot the explained variance ratio per component.plot_correlationPlot a correlation matrix.plot_distributionPlot column distributions.plot_ngramsPlot n-gram frequencies.plot_pcaPlot the explained variance ratio vs number of components.plot_qqPlot a quantile-quantile plot.plot_relationshipsPlot pairwise relationships in a dataset.plot_rfecvPlot the rfecv results.plot_wordcloudPlot a wordcloud from the corpus.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/plots/#hyperparameter-tuning-plots", "title": "Hyperparameter tuning plots", "text": "

                                                                                                                                                                                                                                                                                                                                                                  plot_edfPlot the Empirical Distribution Function of a study.plot_hyperparameter_importancePlot a model's hyperparameter importance.plot_hyperparametersPlot hyperparameter relationships in a study.plot_parallel_coordinatePlot high-dimensional parameter relationships in a study.plot_pareto_frontPlot the Pareto front of a study.plot_slicePlot the parameter relationship in a study.plot_terminator_improvementPlot the potentials for future objective improvement.plot_timelinePlot the timeline of a study.plot_trialsPlot the hyperparameter tuning trials.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/plots/#prediction-plots", "title": "Prediction plots", "text": "

                                                                                                                                                                                                                                                                                                                                                                  plot_calibrationPlot the calibration curve for a binary classifier.plot_confusion_matrixPlot a model's confusion matrix.plot_detPlot the Detection Error Tradeoff curve.plot_errorsPlot a model's prediction errors.plot_evalsPlot evaluation curves.plot_feature_importancePlot a model's feature importance.plot_forecastPlot a time series with model forecasts.plot_gainsPlot the cumulative gains curve.plot_learning_curvePlot the learning curve: score vs number of training samples.plot_liftPlot the lift curve.plot_parshapPlot the partial correlation of shap values.plot_partial_dependencePlot the partial dependence of features.plot_permutation_importancePlot the feature permutation importance of models.plot_pipelinePlot a diagram of the pipeline.plot_prcPlot the precision-recall curve.plot_probabilitiesPlot the probability distribution of the target classes.plot_residualsPlot a model's residuals.plot_resultsPlot the model results.plot_rocPlot the Receiver Operating Characteristics curve.plot_successive_halvingPlot scores per iteration of the successive halving.plot_thresholdPlot metric performances against threshold values.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/plots/#shap-plots", "title": "Shap plots", "text": "

                                                                                                                                                                                                                                                                                                                                                                  plot_shap_barPlot SHAP's bar plot.plot_shap_beeswarmPlot SHAP's beeswarm plot.plot_shap_decisionPlot SHAP's decision plot.plot_shap_forcePlot SHAP's force plot.plot_shap_heatmapPlot SHAP's heatmap plot.plot_shap_scatterPlot SHAP's scatter plot.plot_shap_waterfallPlot SHAP's waterfall plot.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/predicting/", "title": "Predicting", "text": "

                                                                                                                                                                                                                                                                                                                                                                  After training a model, you probably want to make predictions on new, unseen data. Just like a sklearn estimator, you can call the prediction methods from the model, e.g., atom.tree.predict(X).

                                                                                                                                                                                                                                                                                                                                                                  All prediction methods transform the provided data through the pipeline in the model's branch before making the predictions. Transformers that should only be applied on the training set are excluded from this step (e.g., outlier pruning or class balancing).

                                                                                                                                                                                                                                                                                                                                                                  The available prediction methods are the standard methods for estimators in sklearn's and sktime's API.

                                                                                                                                                                                                                                                                                                                                                                  For classification and regression tasks:

                                                                                                                                                                                                                                                                                                                                                                  decision_functionGet confidence scores on new data or existing rows.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.scoreGet a metric score on new data.

                                                                                                                                                                                                                                                                                                                                                                  For forecast tasks:

                                                                                                                                                                                                                                                                                                                                                                  predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.scoreGet a metric score on new data.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  The score method return atom's metric score, not the metric returned by sklearn/sktime's score method for estimators. Use the method's metric parameter to calculate a different metric.

                                                                                                                                                                                                                                                                                                                                                                  Note

                                                                                                                                                                                                                                                                                                                                                                  • The output of ATOM's methods are pandas objects, not numpy arrays.
                                                                                                                                                                                                                                                                                                                                                                  • The predict_proba method of some meta-estimators for multioutput tasks (such as MultioutputClassifier) return 3 dimensions, namely, a list of arrays with shape=(n_samples, n_classes). One array per target column. Since ATOM's prediction methods return pandas objects, such 3-dimensional arrays are converted to a multiindex pd.DataFrame, where the first level of the row indices are the target columns, and the second level are the classes.
                                                                                                                                                                                                                                                                                                                                                                  • The prediction results are cached after the first call to avoid consequent expensive calculations. This mechanism can increase the size of the instance for large datasets. Use the clear method to free the memory.

                                                                                                                                                                                                                                                                                                                                                                  It's also possible to get the prediction for a specific row or rows in the dataset. See the row and column selection section in the user guide to learn how to select the rows, e.g., atom.rf.predict(\"test\") or atom.rf.predict_proba(range(100)).

                                                                                                                                                                                                                                                                                                                                                                  Note

                                                                                                                                                                                                                                                                                                                                                                  For forecast models, prediction on rows follow the ForecastingHorizon API. That means that using the row index works, but for example using atom.arima.predict(1) returns the prediction on the first row of the test set (instead of the second row of the train set).

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/time_series/", "title": "Time series", "text": ""}, {"location": "user_guide/time_series/#forecast", "title": "Forecast", "text": ""}, {"location": "user_guide/time_series/#time-series-classification", "title": "Time series classification", "text": ""}, {"location": "user_guide/time_series/#time-series-regression", "title": "Time series regression", "text": ""}, {"location": "user_guide/training/", "title": "Training", "text": "

                                                                                                                                                                                                                                                                                                                                                                  The training phase is where the models are fitted on the training data. After this, you can use the plots and prediction methods to evaluate the results. The training applies the following steps for all models:

                                                                                                                                                                                                                                                                                                                                                                  1. Use hyperparameter tuning to select the optimal hyperparameters for the model (optional).
                                                                                                                                                                                                                                                                                                                                                                  2. The model is fitted on the training set using the best combination of hyperparameters found. After that, the model is evaluated on the tes set.
                                                                                                                                                                                                                                                                                                                                                                  3. Calculate various scores on the test set using a bootstrap algorithm (optional).

                                                                                                                                                                                                                                                                                                                                                                  There are three approaches to run the training.

                                                                                                                                                                                                                                                                                                                                                                  • Direct training:
                                                                                                                                                                                                                                                                                                                                                                    • DirectClassifier
                                                                                                                                                                                                                                                                                                                                                                    • DirectForecaster
                                                                                                                                                                                                                                                                                                                                                                    • DirectRegressor
                                                                                                                                                                                                                                                                                                                                                                  • Training via successive halving:
                                                                                                                                                                                                                                                                                                                                                                    • SuccessiveHalvingClassifier
                                                                                                                                                                                                                                                                                                                                                                    • SuccessiveHalvingForecaster
                                                                                                                                                                                                                                                                                                                                                                    • SuccessiveHalvingRegressor
                                                                                                                                                                                                                                                                                                                                                                  • Training via train sizing:
                                                                                                                                                                                                                                                                                                                                                                    • TrainSizingClassifier
                                                                                                                                                                                                                                                                                                                                                                    • TrainSizingForecaster
                                                                                                                                                                                                                                                                                                                                                                    • TrainSizingRegressor

                                                                                                                                                                                                                                                                                                                                                                  The direct fashion repeats the aforementioned steps only once, while the other two approaches repeats them more than once. Just like the data cleaning and feature engineering classes, it's discouraged to use these classes directly. Instead, every approach can be called directly from atom through the run, successive_halving and train_sizing methods respectively.

                                                                                                                                                                                                                                                                                                                                                                  Models are called through their acronyms, e.g., atom.run(models=\"RF\") will train a RandomForest. If you want to run the same model multiple times, add a tag after the acronym to differentiate them. the tag must be separated from the accronym by an underscore.

                                                                                                                                                                                                                                                                                                                                                                  atom.run(\n    models=[\"RF_1\", \"RF_2\"],\n    est_params={\n        \"RF_1\": {\"n_estimators\": 100},\n        \"RF_2\": {\"n_estimators\": 200},\n    }\n)\n

                                                                                                                                                                                                                                                                                                                                                                  For example, this pipeline fits two Random Forest models, one with 100 and the other with 200 decision trees. The models can be accessed through atom.rf_1 and atom.rf_2. Use tagged models to test how the same model performs when fitted with different parameters or on different data sets. See the Imbalanced datasets example.

                                                                                                                                                                                                                                                                                                                                                                  Additional things to take into account:

                                                                                                                                                                                                                                                                                                                                                                  • If an exception is encountered while fitting an estimator, the pipeline will automatically jump to the next model. The exceptions are stored in the errors attribute. Note that when a model is skipped, there is no model subclass for that estimator.
                                                                                                                                                                                                                                                                                                                                                                  • When showing the final results, a ! indicates the highest score and a ~ indicates that the model is possibly overfitting (training set has a score at least 20% higher than the test set).

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/training/#metric", "title": "Metric", "text": "

                                                                                                                                                                                                                                                                                                                                                                  ATOM uses sklearn's scorers for model evaluation. A scorer consists of a metric function and some parameters that define the scorer's properties , such as if a higher or lower score is better (score or loss function) or if the function needs probability estimates or rounded predictions (see the make_scorer function). The metric parameter accepts three ways of defining the scorer:

                                                                                                                                                                                                                                                                                                                                                                  • Using the name of one of the predefined scorers.
                                                                                                                                                                                                                                                                                                                                                                  • Using a function with signature function(y_true, y_pred) -> score. In this case, ATOM uses make_scorer with default parameters.
                                                                                                                                                                                                                                                                                                                                                                  • Using a scorer object.

                                                                                                                                                                                                                                                                                                                                                                  Note that all scorers follow the convention that higher return values are better than lower return values. Thus, metrics which measure the distance between the model and the data (i.e., loss functions), like max_error or mean_squared_error, will return the negated value of the metric.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/training/#predefined-scorers", "title": "Predefined scorers", "text": "

                                                                                                                                                                                                                                                                                                                                                                  ATOM accepts all sklearn's scorers as well as some custom acronyms and custom scorers. Since some of sklearn's scorers have quite long names and ATOM is all about lazyfast experimentation, the package provides acronyms for some of the most commonly used ones. These acronyms are case-insensitive and can be used in the metric parameter instead of the scorer's full name, e.g., atom.run(\"LR\", metric=\"BA\") uses balanced_accuracy. The available acronyms are:

                                                                                                                                                                                                                                                                                                                                                                  • \"AP\" for \"average_precision\"
                                                                                                                                                                                                                                                                                                                                                                  • \"BA\" for \"balanced_accuracy\"
                                                                                                                                                                                                                                                                                                                                                                  • \"AUC\" for \"roc_auc\"
                                                                                                                                                                                                                                                                                                                                                                  • \"LogLoss\" for \"neg_log_loss\"
                                                                                                                                                                                                                                                                                                                                                                  • \"EV\" for \"explained_variance\"
                                                                                                                                                                                                                                                                                                                                                                  • \"ME\" for \"max_error\"
                                                                                                                                                                                                                                                                                                                                                                  • \"MAE\" for \"neg_mean_absolute_error\"
                                                                                                                                                                                                                                                                                                                                                                  • \"MSE\" for \"neg_mean_squared_error\"
                                                                                                                                                                                                                                                                                                                                                                  • \"RMSE\" for \"neg_root_mean_squared_error\"
                                                                                                                                                                                                                                                                                                                                                                  • \"MSLE\" for \"neg_mean_squared_log_error\"
                                                                                                                                                                                                                                                                                                                                                                  • \"MEDAE\" for \"neg_median_absolute_error\"
                                                                                                                                                                                                                                                                                                                                                                  • \"MAPE\" for \"neg_mean_absolute_percentage_error\"
                                                                                                                                                                                                                                                                                                                                                                  • \"POISSON\" for \"neg_mean_poisson_deviance\"
                                                                                                                                                                                                                                                                                                                                                                  • \"GAMMA\" for \"neg_mean_gamma_deviance\"

                                                                                                                                                                                                                                                                                                                                                                  ATOM also provides some extra common metrics for binary classification tasks.

                                                                                                                                                                                                                                                                                                                                                                  • \"TN\" for True Negatives
                                                                                                                                                                                                                                                                                                                                                                  • \"FP\" for False Positives
                                                                                                                                                                                                                                                                                                                                                                  • \"FN\" for False Negatives
                                                                                                                                                                                                                                                                                                                                                                  • \"TP\" for True Positives
                                                                                                                                                                                                                                                                                                                                                                  • \"FPR\" for False Positive rate (fall-out)
                                                                                                                                                                                                                                                                                                                                                                  • \"TPR\" for True Positive Rate (sensitivity, recall)
                                                                                                                                                                                                                                                                                                                                                                  • \"TNR\" for True Negative Rate (specificity)
                                                                                                                                                                                                                                                                                                                                                                  • \"FNR\" for False Negative Rate (miss rate)
                                                                                                                                                                                                                                                                                                                                                                  • \"MCC\" for Matthews Correlation Coefficient (also for multiclass classification)

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/training/#multi-metric-runs", "title": "Multi-metric runs", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Sometimes it is useful to measure the performance of the models in more than one way. ATOM lets you run the pipeline with multiple metrics at the same time. To do so, provide the metric parameter with a list of desired metrics, e.g., atom.run(\"LDA\", metric=[\"r2\", \"mse\"]).

                                                                                                                                                                                                                                                                                                                                                                  When fitting multi-metric runs, the resulting scores will return a list of metrics. For example, if you provided three metrics to the pipeline, atom.knn.score_train could return [0.8734, 0.6672, 0.9001]. Only the first metric of a multi-metric run (this metric is called the main metric) is used to select the winning model.

                                                                                                                                                                                                                                                                                                                                                                  Info

                                                                                                                                                                                                                                                                                                                                                                  • The winning model is retrieved comparing only the main metric.
                                                                                                                                                                                                                                                                                                                                                                  • Some plots let you choose which of the metrics in a multi-metric run to show using the metric parameter, e.g., plot_results.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/training/#automated-feature-scaling", "title": "Automated feature scaling", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Models that require feature scaling will automatically do so before training, unless the data is sparse or already scaled. The data is considered scaled if it has one of the following prerequisites:

                                                                                                                                                                                                                                                                                                                                                                  • The mean value over the mean of all columns lies between -0.05 and 0.05 and the mean of the standard deviation over all columns lies between 0.85 and 1.15. Categorical and binary columns (only 0s and 1s) are excluded from the calculation.
                                                                                                                                                                                                                                                                                                                                                                  • There is a transformer in the pipeline whose __name__ contains the word scaler.

                                                                                                                                                                                                                                                                                                                                                                  The scaling is applied using a Scaler with default parameters. It can be accessed from the model through the scaler attribute. The scaled dataset can be examined through the model's data attributes. Use the available_models method to see which models require feature scaling. See here an example.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/training/#in-training-validation", "title": "In-training validation", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Some predefined models allow in-training validation. This means that the estimator is evaluated (using only the main metric) on the train and test set after every round of the training (a round can be an iteration for linear models or an added tree for boosted tree models). The validation scores are stored in the evals attribute, a dictionary of the train and test performances per round (also when pruning isn't applied). Click here for an example using in-training validation.

                                                                                                                                                                                                                                                                                                                                                                  The predefined models that support in-training validation are:

                                                                                                                                                                                                                                                                                                                                                                  • CatBoost
                                                                                                                                                                                                                                                                                                                                                                  • LightGBM
                                                                                                                                                                                                                                                                                                                                                                  • MultiLayerPerceptron
                                                                                                                                                                                                                                                                                                                                                                  • PassiveAggressive
                                                                                                                                                                                                                                                                                                                                                                  • Perceptron
                                                                                                                                                                                                                                                                                                                                                                  • StochasticGradientDescent
                                                                                                                                                                                                                                                                                                                                                                  • XGBoost

                                                                                                                                                                                                                                                                                                                                                                  To apply in-training validation to a custom model, use the has_validation parameter when creating the custom model.

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  • In-training validation is not calculated during hyperparameter tuning.
                                                                                                                                                                                                                                                                                                                                                                  • CatBoost selects the weights achieved by the best evaluation on the test set after training. This means that, by default, there is some minor data leakage in the test set. Use the use_best_model=False parameter to avoid this behavior or use a holdout set to evaluate the final estimator.

                                                                                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                                                                                  Use the plot_evals method to visualize the in-training validation on the train and test sets.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/training/#parameter-customization", "title": "Parameter customization", "text": "

                                                                                                                                                                                                                                                                                                                                                                  By default, every estimator uses the default parameters they get from their respective packages. To select different ones, use the est_params. parameter of the run method. There are two ways to add custom parameters to the models: adding them directly to the dictionary as key-value pairs or through dictionaries.

                                                                                                                                                                                                                                                                                                                                                                  Adding the parameters directly to est_params (or using a dict with the key 'all') shares them across all models in the trainer. In this example, both the XGBoost and the LightGBM model use 200 boosted trees. Make sure all the models do have the specified parameters or an exception will be raised!

                                                                                                                                                                                                                                                                                                                                                                  atom.run(models=[\"XGB\", \"LGB\"], est_params={\"n_estimators\": 200})\n

                                                                                                                                                                                                                                                                                                                                                                  To specify parameters per model, use the model name as key and a dict of the parameters as value. In this example, the XGBoost model uses n_estimators=200 and the MultiLayerPerceptron uses one hidden layer with 75 neurons.

                                                                                                                                                                                                                                                                                                                                                                  atom.run(\n    models=[\"XGB\", \"MLP\"],\n    est_params={\n        \"XGB\": {\"n_estimators\": 200},\n        \"MLP\": {\"hidden_layer_sizes\": (75,)},\n    }\n)\n

                                                                                                                                                                                                                                                                                                                                                                  Some estimators allow you to pass extra parameters to the fit method (besides X and y). This can be done adding _fit at the end of the parameter. For example, to change XGBoost's verbosity, we can run:

                                                                                                                                                                                                                                                                                                                                                                  atom.run(models=\"XGB\", est_params={\"verbose_fit\": True})\n

                                                                                                                                                                                                                                                                                                                                                                  Note

                                                                                                                                                                                                                                                                                                                                                                  If a parameter is specified through est_params, it's ignored by the study, even if it's added manually to ht_params[\"distributions\"].

                                                                                                                                                                                                                                                                                                                                                                  Info

                                                                                                                                                                                                                                                                                                                                                                  The estimator's n_jobs and random_state parameters adopt atom's values (when available), unless specified through est_params.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/training/#hyperparameter-tuning", "title": "Hyperparameter tuning", "text": "

                                                                                                                                                                                                                                                                                                                                                                  In order to achieve maximum performance, it's important to tune an estimator's hyperparameters before training it. ATOM provides hyperparameter tuning through the optuna package. Just like optuna, we use the terms study and trial as follows:

                                                                                                                                                                                                                                                                                                                                                                  • Study: optimization based on an objective function.
                                                                                                                                                                                                                                                                                                                                                                  • Trial: a single execution of the objective function.

                                                                                                                                                                                                                                                                                                                                                                  Each trial is either computed by cross-validation on the complete training set or by randomly splitting the training set every iteration into a (sub)training and validation set. This process can create some minimum data leakage towards specific parameters (since the estimator is evaluated on data that is used to train the next estimator), but it ensures maximal use of the provided data. However, the leakage is not present in the independent test set, thus the final score of every model is unbiased. Note that, if the dataset is relatively small, the tuning's best score can consistently be lower than the final score on the test set due to the considerable lower fraction of instances on which it is trained. After finishing the study, the parameters that resulted in the best score are used to fit the final model on the complete training set.

                                                                                                                                                                                                                                                                                                                                                                  Info

                                                                                                                                                                                                                                                                                                                                                                  • Unless specified differently by the user, the used samplers are TPESampler for single-metric runs and NSGAIISampler for multi-metric runs.
                                                                                                                                                                                                                                                                                                                                                                  • For multi-metric runs, the selected best trial is the trial that performed best on the main metric. Use the property's @setter to change it to any other trial. See the hyperparameter tuning example.

                                                                                                                                                                                                                                                                                                                                                                  There are many possibilities to tune the study to your liking. The main parameter is n_trials, which determine the number of trials that are performed.

                                                                                                                                                                                                                                                                                                                                                                  Extra things to take into account:

                                                                                                                                                                                                                                                                                                                                                                  • The train/validation splits are different per trial but equal for all models.
                                                                                                                                                                                                                                                                                                                                                                  • Re-evaluating the objective function at the same point (with the same hyperparameters) automatically skips the calculation and returns the same score as the equivalent trial.

                                                                                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                                                                                  The hyperparameter tuning output can become quite wide for models with many hyperparameters. If you are working in a Jupyter Notebook, you can change the output's width running the following code in a cell:

                                                                                                                                                                                                                                                                                                                                                                  from IPython.display import display, HTML\ndisplay(HTML(\"<style>.container { width:100% !important; }</style>\"))\n

                                                                                                                                                                                                                                                                                                                                                                  Other settings can be changed through the ht_params parameter, a dictionary where every key-value combination can be used to further customize the optimization.

                                                                                                                                                                                                                                                                                                                                                                  By default, which hyperparameters are tuned and their corresponding distributions are predefined by ATOM. Use the 'distributions' key to customize these. Just like with est_params, it's possible to share the same parameters across models or use a dictionary with the model name as key to specify the parameters for every individual model. Use the key 'all' to tune some hyperparameters for all models when you also want to tune other parameters only for specific ones. The following example tunes the n_estimators parameter for both models but the max_depth parameter only for the RandomForest.

                                                                                                                                                                                                                                                                                                                                                                  atom.run(\n    models=[\"ET\", \"RF\"],\n    n_trials=30,\n    ht_params={\"distributions\": {\"all\": \"n_estimators\", \"RF\": \"max_depth\"}},\n)\n

                                                                                                                                                                                                                                                                                                                                                                  Like the columns parameter in atom's methods, you can exclude parameters from the optimization adding ! before its name. It's possible to exclude multiple parameters, but not to combine inclusion and exclusion for the same model. For example, to optimize a RandomForest using all its predefined parameters except n_estimators, run:

                                                                                                                                                                                                                                                                                                                                                                  atom.run(\n    models=\"ET\",\n    n_trials=15,\n    ht_params={\"distributions\": \"!n_estimators\"},\n)\n

                                                                                                                                                                                                                                                                                                                                                                  If just the parameter name is provided, the predefined distribution is used. It's also possible to provide custom distributions spaces, but make sure they are compliant with optuna's API. See every model's individual documentation in ATOM's API section for an overview of their hyperparameters and distributions.

                                                                                                                                                                                                                                                                                                                                                                  from optuna.distributions import (\n    IntDistribution, FloatDistribution, CategoricalDistribution\n)\n\natom.run(\n    models=[\"ET\", \"RF\"],\n    n_trials=30,\n    ht_params={\n        \"dimensions\": {\n            \"all\": {\"n_estimators\": IntDistribution(10, 100, step=10)},\n            \"RF\": {\n                \"max_depth\": IntDistribution(1, 10),\n                \"max_features\": CategoricalDistribution([\"sqrt\", \"log2\"]),\n           },\n        },\n    }\n)\n

                                                                                                                                                                                                                                                                                                                                                                  Parameters for optuna's study and the study's optimize method can be added as kwargs to ht_params. For example, to use a different sampler or add a custom callback.

                                                                                                                                                                                                                                                                                                                                                                  from optuna.samplers import RandomSampler\n\natom.run(\n    models=\"LR\",\n    n_trials=30,\n    ht_params={\n        \"sampler\": RandomSampler(seed=atom.random_state),\n        \"callbacks\": custom_callback(),\n    },\n)\n

                                                                                                                                                                                                                                                                                                                                                                  Note

                                                                                                                                                                                                                                                                                                                                                                  • If you use the default sampler, it\u2019s recommended to consider setting larger n_trials to make full use of the characteristics of TPESampler because TPESampler uses some (by default, 10) trials for its startup.
                                                                                                                                                                                                                                                                                                                                                                  • When specifying distributions manually, make sure to import the distribution types from optuna: from optuna.distributions import ....

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  Keras' models can only use hyperparameter tuning when n_jobs=1 or ht_params={\"cv\": 1}. Using n_jobs > 1 and cv > 1 raises a PicklingError due to incompatibilities of the APIs. Read here more about deep learning models.

                                                                                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                                                                                  ATOM has several plots that can help you examine a model's study and trials. Have a look at them here.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/training/#pruning", "title": "Pruning", "text": "

                                                                                                                                                                                                                                                                                                                                                                  During hyperparameter tuning, pruning stops unpromising trials at the early stages of the training (a.k.a., automated early-stopping). This can save the pipeline much time that would otherwise be wasted on an estimator that is unlikely to yield the best results. A pruned trial can't be selected as best_trial. Click here to see an example that uses pruning.

                                                                                                                                                                                                                                                                                                                                                                  The study uses MedianPruner as default pruner. You can use any other of optuna's pruners through the ht_params parameter.

                                                                                                                                                                                                                                                                                                                                                                  from optuna.pruners import HyperbandPruner\n\natom.run(\"SGD\", n_trials=30, ht_params={\"pruner\": HyperbandPruner()})\n

                                                                                                                                                                                                                                                                                                                                                                  Warning

                                                                                                                                                                                                                                                                                                                                                                  • Pruning is disabled for multi-metric runs.
                                                                                                                                                                                                                                                                                                                                                                  • Pruning is only available for models that support in-training validation.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/training/#bootstrapping", "title": "Bootstrapping", "text": "

                                                                                                                                                                                                                                                                                                                                                                  After fitting the estimator, you can assess the robustness of the model using the bootstrap technique. This technique creates several new data sets selecting random samples from the training set (with replacement) and evaluates them on the test set. This way you can get a distribution of the performance of the model. The sets are the same for every model. The number of sets can be chosen through the n_bootstrap parameter.

                                                                                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                                                                                  Use the plot_results method to plot the boostrap scores in a boxplot.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/training/#successive-halving", "title": "Successive halving", "text": "

                                                                                                                                                                                                                                                                                                                                                                  Successive halving is a bandit-based algorithm that fits N models to 1/N of the data. The best half are selected to go to the next iteration where the process is repeated. This continues until only one model remains, which is fitted on the complete dataset. Beware that a model's performance can depend greatly on the amount of data on which it is trained. For this reason, we recommend only to use this technique with similar models, e.g., only using tree-based models.

                                                                                                                                                                                                                                                                                                                                                                  Run successive halving from atom via the successive_halving method. Consecutive runs of the same model are saved with the model's acronym followed by the number of models in the run. For example, a RandomForest in a run with 4 models would become model RF4.

                                                                                                                                                                                                                                                                                                                                                                  See here a successive halving example.

                                                                                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                                                                                  Use the plot_successive_halving method to see every model's performance per iteration of the successive halving.

                                                                                                                                                                                                                                                                                                                                                                  "}, {"location": "user_guide/training/#train-sizing", "title": "Train sizing", "text": "

                                                                                                                                                                                                                                                                                                                                                                  When training models, there is usually a trade-off between model performance and computation time, that is regulated by the number of samples in the training set. Train sizing can be used to create insights in this trade-off, and help determine the optimal size of the training set. The models are fitted multiple times, ever-increasing the number of samples in the training set.

                                                                                                                                                                                                                                                                                                                                                                  Run train sizing from atom via the train_sizing method. The number of iterations and the number of samples per training can be specified with the train_sizes parameter. Consecutive runs of the same model are saved with the model's acronym followed by the fraction of rows in the training set (the . is removed from the fraction!). For example, a RandomForest in a run with 80% of the training samples would become model RF08.

                                                                                                                                                                                                                                                                                                                                                                  See here a train sizing example.

                                                                                                                                                                                                                                                                                                                                                                  Tip

                                                                                                                                                                                                                                                                                                                                                                  Use the plot_learning_curve method to see the model's performance per size of the training set.

                                                                                                                                                                                                                                                                                                                                                                  "}]} diff --git a/docs_sources/api/plots/plot_acf.md b/docs_sources/api/plots/plot_acf.md new file mode 100644 index 000000000..01a4b44c8 --- /dev/null +++ b/docs_sources/api/plots/plot_acf.md @@ -0,0 +1,16 @@ +# plot_acf +---------- + +:: atom.plots:DataPlot.plot_acf + :: signature + :: head + :: table: + - parameters + - returns + :: see also + +
                                                                                                                                                                                                                                                                                                                                                                  + +## Example + +:: examples diff --git a/docs_sources/api/plots/plot_decomposition.md b/docs_sources/api/plots/plot_decomposition.md new file mode 100644 index 000000000..dcc3a7dfa --- /dev/null +++ b/docs_sources/api/plots/plot_decomposition.md @@ -0,0 +1,16 @@ +# plot_decomposition +-------------------- + +:: atom.plots:DataPlot.plot_decomposition + :: signature + :: head + :: table: + - parameters + - returns + :: see also + +
                                                                                                                                                                                                                                                                                                                                                                  + +## Example + +:: examples diff --git a/docs_sources/api/plots/plot_pacf.md b/docs_sources/api/plots/plot_pacf.md new file mode 100644 index 000000000..b7edbe40a --- /dev/null +++ b/docs_sources/api/plots/plot_pacf.md @@ -0,0 +1,16 @@ +# plot_pacf +----------- + +:: atom.plots:DataPlot.plot_pacf + :: signature + :: head + :: table: + - parameters + - returns + :: see also + +
                                                                                                                                                                                                                                                                                                                                                                  + +## Example + +:: examples diff --git a/docs_sources/changelog/v5.x.x.md b/docs_sources/changelog/v5.x.x.md index cfc1de2e4..a9674f0d4 100644 --- a/docs_sources/changelog/v5.x.x.md +++ b/docs_sources/changelog/v5.x.x.md @@ -1,65 +1,6 @@ # Release history ----------------- - - -## Version 6.0.0 - -**:star: New features** - -* Completely new module for time series. Read more in the [user guide][time-series]. -* Support for [Python 3.11](https://www.python.org/downloads/release/python-3110/) and drop support for [Python 3.8](https://www.python.org/downloads/release/python-380/) - and [Python 3.9](ttps://www.python.org/downloads/release/python-390/). -* New data engines. Read more in the [user guide][data-acceleration]. -* Improved memory optimizations. Read more in the [user guide][memory-considerations]. -* Added the `iterative` strategy for [numerical imputation][imputer]. -* Added the `hdbscan` strategy to the [Pruner][] class. -* Use the [`ignore`][atomclassifier-ignore] parameter to ignore columns in the dataset. -* New [update_traces][atomclassifier-update_traces] method to further customize your plots. - -**:pencil: API changes** - -* The [plot_results][] method is divided into [plot_results][] and [plot_bootstrap][] - and accepts any metric. -* The [FeatureGrouper][] class no longer accepts a `name` parameter. Provide - the group names directly through the `group` parameter as dict. -* Rework of the [register][adaboost-register] method. -* The `multioutput` attribute is deprecated. Multioutput meta-estimators are - now assigned automatically. -* Model tags have to be separated from the acronym by an underscore. 
-* The [`engine`][atomclassifier-engine] parameter is now a dict. -* The `automl` method is deprecated. - -**:rocket: Enhancements** - -* Transformations only on `y` are now accepted, e.g., `atom.scale(columns=-1)`. -* Full support for [pandas nullable dtypes](https://pandas.pydata.org/docs/user_guide/integer_na.html). -* The dataset can now be provided as callable. -* The [FeatureExtractor][] class can extract features from the dataset's index. -* Subplots can now share axes on the [canvas][atomclassifier-canvas]. -* The [save][atomclassifier-save] and [save_data][atomclassifier-save_data] - methods now accept [pathlib.Path][] objects as `filename`. -* Cleaner representation on hover for the [plot_timeline][] method. -* The `cv` key in `ht_params` now accepts a custom cross-validation generator. -* Improved error message for incorrect stratification of multioutput datasets. -* Rework of the [shrink][atomclassifier-shrink] method. - -**:bug: Bug fixes** - -* Fixed a bug where the [cross_validate][adaboost-cross_validate] method could - fail for pipelines that changed the number of rows. -* Fixed a bug where the [Pruner][] class didn't drop all outlier clusters. -* Fixed a bug where the pipeline could fail for transformers that returned a - series. -* Fixed a bug where the pipeline could fail for transformers that reset its - internal attributes during fitting. -* Fixed a bug where the [register][adaboost-register] method failed in Databricks. -* Fixed a bug where tuning hyperparameter for a `base_estimator` inside a custom - meta-estimator would fail. -* Fixed a bug where the data properties' `@setter` could fail for numpy arrays. -* Fixed a bug where reference lines for some plots didn't lie exactly on the unity line. 
- - ## Version 5.2.0 diff --git a/docs_sources/changelog/v6.x.x.md b/docs_sources/changelog/v6.x.x.md new file mode 100644 index 000000000..a0b32c8a7 --- /dev/null +++ b/docs_sources/changelog/v6.x.x.md @@ -0,0 +1,59 @@ +# Release history +----------------- + + +## Version 6.0.0 + +**:star: New features** + +* Completely new module for time series. Read more in the [user guide][time-series]. +* Support for [Python 3.11](https://www.python.org/downloads/release/python-3110/) and drop support for [Python 3.8](https://www.python.org/downloads/release/python-380/) + and [Python 3.9](https://www.python.org/downloads/release/python-390/). +* New data engines. Read more in the [user guide][data-acceleration]. +* Improved memory optimizations. Read more in the [user guide][memory-considerations]. +* Added the `iterative` strategy for [numerical imputation][imputer]. +* Added the `hdbscan` strategy to the [Pruner][] class. +* Use the [`ignore`][atomclassifier-ignore] parameter to ignore columns in the dataset. +* New [update_traces][atomclassifier-update_traces] method to further customize your plots. + +**:pencil: API changes** + +* The [plot_results][] method is divided into [plot_results][] and [plot_bootstrap][] + and accepts any metric. +* The [FeatureGrouper][] class no longer accepts a `name` parameter. Provide + the group names directly through the `group` parameter as dict. +* Rework of the [register][adaboost-register] method. +* The `multioutput` attribute is deprecated. Multioutput meta-estimators are + now assigned automatically. +* Model tags have to be separated from the acronym by an underscore. +* The [`engine`][atomclassifier-engine] parameter is now a dict. +* The `automl` method is deprecated. + +**:rocket: Enhancements** + +* Transformations only on `y` are now accepted, e.g., `atom.scale(columns=-1)`. +* Full support for [pandas nullable dtypes](https://pandas.pydata.org/docs/user_guide/integer_na.html). +* The dataset can now be provided as callable. 
+* The [FeatureExtractor][] class can extract features from the dataset's index. +* Subplots can now share axes on the [canvas][atomclassifier-canvas]. +* The [save][atomclassifier-save] and [save_data][atomclassifier-save_data] + methods now accept [pathlib.Path][] objects as `filename`. +* Cleaner representation on hover for the [plot_timeline][] method. +* The `cv` key in `ht_params` now accepts a custom cross-validation generator. +* Improved error message for incorrect stratification of multioutput datasets. +* Rework of the [shrink][atomclassifier-shrink] method. + +**:bug: Bug fixes** + +* Fixed a bug where the [cross_validate][adaboost-cross_validate] method could + fail for pipelines that changed the number of rows. +* Fixed a bug where the [Pruner][] class didn't drop all outlier clusters. +* Fixed a bug where the pipeline could fail for transformers that returned a + series. +* Fixed a bug where the pipeline could fail for transformers that reset its + internal attributes during fitting. +* Fixed a bug where the [register][adaboost-register] method failed in Databricks. +* Fixed a bug where tuning hyperparameter for a `base_estimator` inside a custom + meta-estimator would fail. +* Fixed a bug where the data properties' `@setter` could fail for numpy arrays. +* Fixed a bug where reference lines for some plots didn't lie exactly on the unity line. diff --git a/docs_sources/dependencies.md b/docs_sources/dependencies.md index 513dabbef..f9fcd089e 100644 --- a/docs_sources/dependencies.md +++ b/docs_sources/dependencies.md @@ -41,7 +41,7 @@ packages are necessary for its correct functioning. 
* **[numpy](https://numpy.org/)** (>=1.23.0) * **[optuna](https://optuna.org/)** (>=3.4.0) * **[pandas[parquet]](https://pandas.pydata.org/)** (>=2.1.2) -* **[plotly](https://plotly.com/python/)** (>=5.15.0) +* **[plotly](https://plotly.com/python/)** (>=5.18.0) * **[ray[serve]](https://docs.ray.io/en/latest/)** (>=2.7.1) * **[requests](https://requests.readthedocs.io/en/latest/)** (>=2.31.0) * **[scikit-learn](https://scikit-learn.org/stable/)** (>=1.4.0) diff --git a/mkdocs.yml b/mkdocs.yml index 2c484c99a..1df50d295 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -233,11 +233,13 @@ nav: - Pipeline: - Pipeline: API/pipeline/pipeline.md - Plots: + - plot_acf: API/plots/plot_acf.md - plot_bootstrap: API/plots/plot_bootstrap.md - plot_calibration: API/plots/plot_calibration.md - plot_components: API/plots/plot_components.md - plot_confusion_matrix: API/plots/plot_confusion_matrix.md - plot_correlation: API/plots/plot_correlation.md + - plot_decomposition: API/plots/plot_decomposition.md - plot_det: API/plots/plot_det.md - plot_distribution: API/plots/plot_distribution.md - plot_edf: API/plots/plot_edf.md @@ -251,6 +253,7 @@ nav: - plot_learning_curve: API/plots/plot_learning_curve.md - plot_lift: API/plots/plot_lift.md - plot_ngrams: API/plots/plot_ngrams.md + - plot_pacf: API/plots/plot_pacf.md - plot_parallel_coordinate: API/plots/plot_parallel_coordinate.md - plot_pareto_front: API/plots/plot_pareto_front.md - plot_parshap: API/plots/plot_parshap.md diff --git a/pyproject.toml b/pyproject.toml index d79784634..f34a6e136 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ dependencies = [ "numpy>=1.23.0", "optuna>=3.4.0", "pandas[parquet]>=2.1.2", - "plotly>=5.15.0", + "plotly>=5.18.0", "ray[serve]>=2.7.1", "requests>=2.31.0", "scikit-learn>=1.4.0", diff --git a/tests/test_plots.py b/tests/test_plots.py index 163e5b68e..1ce62ca70 100644 --- a/tests/test_plots.py +++ b/tests/test_plots.py @@ -275,6 +275,12 @@ def test_update_traces(): # Test DataPlot 
==================================================== >> +def test_plot_acf(): + """Assert that the plot_acf method works.""" + atom = ATOMForecaster(y_fc, random_state=1) + atom.plot_acf(display=False) + + @pytest.mark.parametrize("show", [10, None]) def test_plot_components(show): """Assert that the plot_components method works.""" @@ -311,6 +317,12 @@ def test_plot_ngrams(ngram): atom.plot_ngrams(ngram=ngram, display=False) # When the corpus consists of tokens +def test_plot_pacf(): + """Assert that the plot_pacf method works.""" + atom = ATOMForecaster(y_fc, random_state=1) + atom.plot_pacf(display=False) + + @pytest.mark.parametrize("X", [X10, X_sparse]) def test_plot_pca(X): """Assert that the plot_pca method works."""